# Fake News Classifier

## Loading and Preprocessing the Data

In [1]:
# importing the required data science and NLP libraries
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.util import ngrams
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import sys
from unicodedata import category
from gensim.parsing.preprocessing import remove_stopwords
from textblob import TextBlob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# reading in the real news corpus and fake news corpus
real = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [3]:
# viewing the first few rows of the real news corpus
real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
# viewing the first few rows of the fake news corpus
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
# defining the target labels
real['target'] = 0
fake['target'] = 1

In [6]:
# viewing the updated dataframe
real.head()

Unnamed: 0,title,text,subject,date,target
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [7]:
# viewing the updated dataframe
fake.head()

Unnamed: 0,title,text,subject,date,target
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [8]:
# checking the shape of the dataframe with real news stories
real.shape

(21417, 5)

In [9]:
# checking the shape of the dataframe with fake news stories
fake.shape

(23481, 5)

### There are 21,417 real news stories and 23,481 fake news stories in the dataset. While the size of these classes are not exactly equal, the dataset is balanced enough that we can use accuracy as the performance metric. 

In [10]:
# checking a real news story
real_sample = real['text'][0]
real_sample

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [11]:
# creating a list of all unicode punctuation characters
# this method is preferable to "from string import punctuation" as the latter only imports the ASCII punctuation marks
# https://stackoverflow.com/questions/60983836/complete-set-of-punctuation-marks-for-python-not-just-ascii
chrs = (chr(i) for i in range(sys.maxunicode + 1))
punctuation = set(c for c in chrs if category(c).startswith("P"))
punctuation

{'!',
 '"',
 '#',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '_',
 '{',
 '}',
 '¡',
 '§',
 '«',
 '¶',
 '·',
 '»',
 '¿',
 ';',
 '·',
 '՚',
 '՛',
 '՜',
 '՝',
 '՞',
 '՟',
 '։',
 '֊',
 '־',
 '׀',
 '׃',
 '׆',
 '׳',
 '״',
 '؉',
 '؊',
 '،',
 '؍',
 '؛',
 '؞',
 '؟',
 '٪',
 '٫',
 '٬',
 '٭',
 '۔',
 '܀',
 '܁',
 '܂',
 '܃',
 '܄',
 '܅',
 '܆',
 '܇',
 '܈',
 '܉',
 '܊',
 '܋',
 '܌',
 '܍',
 '߷',
 '߸',
 '߹',
 '࠰',
 '࠱',
 '࠲',
 '࠳',
 '࠴',
 '࠵',
 '࠶',
 '࠷',
 '࠸',
 '࠹',
 '࠺',
 '࠻',
 '࠼',
 '࠽',
 '࠾',
 '࡞',
 '।',
 '॥',
 '॰',
 '৽',
 '੶',
 '૰',
 '಄',
 '෴',
 '๏',
 '๚',
 '๛',
 '༄',
 '༅',
 '༆',
 '༇',
 '༈',
 '༉',
 '༊',
 '་',
 '༌',
 '།',
 '༎',
 '༏',
 '༐',
 '༑',
 '༒',
 '༔',
 '༺',
 '༻',
 '༼',
 '༽',
 '྅',
 '࿐',
 '࿑',
 '࿒',
 '࿓',
 '࿔',
 '࿙',
 '࿚',
 '၊',
 '။',
 '၌',
 '၍',
 '၎',
 '၏',
 '჻',
 '፠',
 '፡',
 '።',
 '፣',
 '፤',
 '፥',
 '፦',
 '፧',
 '፨',
 '᐀',
 '᙭',
 '᙮',
 '᚛',
 '᚜',
 '᛫',
 '᛬',
 '᛭',
 '᜵',
 '᜶',
 '។',
 '៕',
 '៖',
 '៘',
 '៙',
 '៚',
 '᠀',
 '᠁',
 '᠂',
 '᠃',
 '᠄

## Exploratory Data Analysis

In [12]:
# creating a function to clean the text
def clean_text(text):
    # lowercase text
    text = text.lower()
    # remove punctuation
    for p in punctuation:
        text = text.replace(p, '')
    # remove stopwords
    text = remove_stopwords(text)
    # define custom stopwords
    stopwords = ['said', 'according', 'sunday', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday']
    # perform word tokenization
    words = word_tokenize(text)
    # remove custom stopwords
    words = [word for word in words if word not in stopwords]
    return words

In [13]:
# creating a function to get the unique set of vocab words in a preprocessed text
def get_vocab(text):
    # calling the clean_text function
    text = clean_text(text)
    # lemmatize text
    lemmas = [lemmatizer.lemmatize(word) for word in text]
    # create vocab set
    vocab = sorted(list(set(lemmas)))
    # return vocab
    return vocab

In [14]:
# getting the vocab words from a real news story
real_vocab = get_vocab(real_sample)
real_vocab

['$',
 '10',
 '11',
 '15',
 '20',
 '2018',
 '30',
 '44',
 '6',
 '7',
 '81',
 'action',
 'add',
 'administration',
 'agreeing',
 'aid',
 'aide',
 'approach',
 'approve',
 'arrival',
 'assist',
 'assistance',
 'balloon',
 'battle',
 'begin',
 'big',
 'biggest',
 'bill',
 'billion',
 'boost',
 'border',
 'borrow',
 'bracing',
 'brought',
 'budget',
 'california',
 'call',
 'called',
 'campaign',
 'caucus',
 'cbs',
 'chairman',
 'change',
 'child',
 'childhood',
 'clean',
 'clear',
 'come',
 'congress',
 'congressional',
 'conservative',
 'control',
 'corporation',
 'country',
 'created',
 'crowley',
 'cut',
 'daca',
 'date',
 'debbie',
 'debt',
 'debtfinanced',
 'december',
 'defend',
 'deferred',
 'deficit',
 'democrat',
 'democratic',
 'deportation',
 'dingell',
 'disabled',
 'discretionary',
 'discus',
 'donald',
 'dont',
 'dreamer',
 'drew',
 'early',
 'economy',
 'education',
 'elderly',
 'election',
 'emergency',
 'entitlement',
 'environmental',
 'eventually',
 'exceeded',
 'exchan

In [15]:
# checking the number of unique vocab words in a preprocessed real news story
len(real_vocab)

276

In [16]:
# checking the number of characters in the same real news story
len(real_sample)

4659

In [17]:
# checking the number of unique vocab words out of number of characters
len(real_vocab) / len(real_sample)

0.0592401802962009

In [18]:
# getting a fake news story
fake_sample = fake['text'][0]

In [19]:
# getting the vocab from the preprocessed fake news story
fake_vocab = get_vocab(fake_sample)

In [20]:
# checking the number of unique vocab words in a preprocessed fake news story
len(fake_vocab)

159

In [21]:
# checking the number of characters in the same fake news story
len(fake_sample)

2893

In [22]:
# checking the number of unique vocab words out of number of characters
len(fake_vocab) / len(fake_sample)

0.054960248876598686

In [23]:
%%time
# applying the get_vocab function to the dataframe with real news stories
real['vocab'] = real['text'].apply(get_vocab)

CPU times: user 53.1 s, sys: 390 ms, total: 53.5 s
Wall time: 53.5 s


In [24]:
%%time
# applying the get_vocab function to the dataframe with fake news stories
fake['vocab'] = fake['text'].apply(get_vocab)

CPU times: user 55.1 s, sys: 455 ms, total: 55.6 s
Wall time: 55.5 s


In [25]:
# checking the updated dataframe
real

Unnamed: 0,title,text,subject,date,target,vocab
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0,"[$, 10, 11, 15, 20, 2018, 30, 44, 6, 7, 81, ac..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0,"[1, 2017, 2018, 21, 8, accept, accepting, acce..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0,"[2016, 2018, accusing, administration, adviser..."
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0,"[2016, 30, abdel, accused, action, adviser, ag..."
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0,"[$, 086, 115, 117590, 195, 2, 2014, 2015, 2016..."
...,...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0,"[11, 12000, 15, 16, 2001, 4000, 8400, acknowle..."
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0,"[2017, academic, access, article, asked, block..."
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0,"[1917, 2012, 21, 23, 42, abandoned, alexander,..."
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0,"[1054, 12, alongside, apart, believed, billion..."


In [26]:
# checking the updated dataframe
fake

Unnamed: 0,title,text,subject,date,target,vocab
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,"[2016happy, 2016this, 2017do, 2017he, 2017here..."
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,"[2016, abdel, administration, adviser, aide, a..."
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,"[2017are, 2017he, 2017i, 2017is, 2017make, 201..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,"[2017, 2017after, 2017donald, 2017it, 2017that..."
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,"[acceptance, added, agreed, amen, analogy, ann..."
...,...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1,"[10, 11, 2016, 21st, 21wire, able, act, action..."
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1,"[$, 2013, 21st, 4, accept, access, ad, adverti..."
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1,"[$, 100, 14, 15, 18, 1916, 1998, 20, 2006, 201..."
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1,"[$, 13, 2003, 2012, 2015, 2016, 2016in, 21st, ..."


In [27]:
# getting the vocab size for news stories in both dataframes
real['vocab_size'] = real['vocab'].apply(len)
fake['vocab_size'] = fake['vocab'].apply(len)

In [28]:
# checking the updated dataframe
real

Unnamed: 0,title,text,subject,date,target,vocab,vocab_size
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0,"[$, 10, 11, 15, 20, 2018, 30, 44, 6, 7, 81, ac...",276
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0,"[1, 2017, 2018, 21, 8, accept, accepting, acce...",218
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0,"[2016, 2018, accusing, administration, adviser...",144
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0,"[2016, 30, abdel, accused, action, adviser, ag...",142
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0,"[$, 086, 115, 117590, 195, 2, 2014, 2015, 2016...",278
...,...,...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0,"[11, 12000, 15, 16, 2001, 4000, 8400, acknowle...",180
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0,"[2017, academic, access, article, asked, block...",52
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0,"[1917, 2012, 21, 23, 42, abandoned, alexander,...",139
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0,"[1054, 12, alongside, apart, believed, billion...",82


In [29]:
# checking the updated dataframe
fake

Unnamed: 0,title,text,subject,date,target,vocab,vocab_size
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,"[2016happy, 2016this, 2017do, 2017he, 2017here...",159
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,"[2016, abdel, administration, adviser, aide, a...",132
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,"[2017are, 2017he, 2017i, 2017is, 2017make, 201...",215
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,"[2017, 2017after, 2017donald, 2017it, 2017that...",156
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,"[acceptance, added, agreed, amen, analogy, ann...",137
...,...,...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",1,"[10, 11, 2016, 21st, 21wire, able, act, action...",214
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",1,"[$, 2013, 21st, 4, accept, access, ad, adverti...",112
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",1,"[$, 100, 14, 15, 18, 1916, 1998, 20, 2006, 201...",1186
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",1,"[$, 13, 2003, 2012, 2015, 2016, 2016in, 21st, ...",184


In [30]:
# checking the average vocab size of the real news stories
# note: the number of words in a news story can vary, so this measure is not scaled
real['vocab_size'].mean()

143.8686090488864

In [31]:
# checking the average vocab size of the fake news stories
# note: the number of words in a news story can vary, so this measure is not scaled
fake['vocab_size'].mean()

150.39802393424472

In [32]:
# defining a function to get the number of total words in a text
def get_num_words(text):
    # remove punctuation
    for p in punctuation:
        text = text.replace(p, '')
    # split text
    words = text.split()
    # return number of words
    return len(words)

In [33]:
# checking the number of words in a real news story
get_num_words(real_sample)

745

In [34]:
# checking the number of words in a fake news story
get_num_words(fake_sample)

492

In [35]:
# creating a new feature of the percentage of unique vocab words out of the total number of words in each news story
real['vocab_percentage'] = real['vocab'].apply(len) / real['text'].apply(get_num_words)
fake['vocab_percentage'] = fake['vocab'].apply(len) / fake['text'].apply(get_num_words)

In [36]:
# checking the average diversity of vocab words in the dataframe with real news stories
real['vocab_percentage'].mean()

0.40240445558856386

In [37]:
# checking the average diversity of vocab words in the dataframe with fake news stories
fake['vocab_percentage'].mean()

0.3874380934584322

### The higher the vocab percentage, the more diverse vocab there is. Consider:

### Diverse sentence: "Colorless green ideas sleep furiously." (vocab percentage = 1.0)
### Simple sentence: "This is very, very good." (vocab percentage = 0.8)

### The vocab of the real news stories are only slightly higher on average than those of the fake news stories, likely making this particular measure not particularly useful as a feature. 

In [38]:
# creating a function to get the sentiment of a text
def get_sentiment(text):
  blob = TextBlob(text)
  return blob.sentiment.polarity

In [39]:
# creating a function to get the subjectivity of a text
def get_subjectivity(text):
  blob = TextBlob(text)
  return blob.sentiment.subjectivity

In [40]:
%%time
# applying the above two functions to both dataframes
real['sentiment'] = real['text'].apply(get_sentiment)
real['subjectivity'] = real['text'].apply(get_subjectivity)
fake['sentiment'] = fake['text'].apply(get_sentiment)
fake['subjectivity'] = fake['text'].apply(get_subjectivity)

CPU times: user 3min 39s, sys: 781 ms, total: 3min 40s
Wall time: 3min 40s


In [41]:
# checking the average sentiment of the real news stories
real['sentiment'].mean()

0.053532708516390845

In [42]:
# checking the average sentiment of the fake news stories
fake['sentiment'].mean()

0.05947378235243156

In [43]:
# checking the average subjectivity of the real news stories
real['subjectivity'].mean()

0.3617075431888403

In [44]:
# checking the average subjectivity of the fake news stories
fake['subjectivity'].mean()

0.43367391393690086

### These results are interesting, and perhaps a bit surprising. The sentiment of the real and fake news stories are similar, and actually the fake news stories are slightly more positive on average. The fake news stories are noticeably more subjective on average than their real counterparts, though, which is probably to be expected. 

## Train-Test Split

In [45]:
# creating a corpus of the real news stories in a list
real_corpus = real['text'].to_list()
real_corpus[0]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [46]:
# creating a corpus of the fake news stories in a list
fake_corpus = fake['text'].to_list()
fake_corpus[0]

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

In [47]:
# creating a combined corpus of real and fake news stories
corpus = real_corpus + fake_corpus

In [48]:
# creating a list of all the labels
real_y = real['target'].to_list()
fake_y = fake['target'].to_list()
y = real_y + fake_y

In [49]:
# creating the train and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=42)

In [51]:
# checking the lengths of X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

35918 8980 35918 8980


## TF-IDF Vectorizer

In [52]:
# initiating a TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [53]:
# fitting and transforming X_train with the TF-IDF vectorizer
X_train = vectorizer.fit_transform(X_train)

In [54]:
# checking the unique tokens from the vectorizer
vectorizer.get_feature_names()

['00',
 '000',
 '0000',
 '00000017',
 '00004',
 '000048',
 '000063',
 '00007',
 '000270',
 '00042',
 '0005',
 '0009',
 '000938',
 '000a',
 '000after',
 '000although',
 '000american',
 '000california',
 '000cases',
 '000cylvia',
 '000dillon000',
 '000ecuador',
 '000florida',
 '000georgia',
 '000have',
 '000illegal',
 '000illinois',
 '000in',
 '000jose',
 '000kyrgyzstan',
 '000m',
 '000michigan',
 '000new',
 '000oman',
 '000s',
 '000saudi',
 '000south',
 '000th',
 '000the',
 '000uterine',
 '001',
 '00106',
 '0011',
 '00155',
 '0018',
 '0019',
 '00193',
 '001romney',
 '001st',
 '002',
 '0020',
 '00240',
 '002singapore',
 '003',
 '004',
 '0040',
 '00458',
 '0047',
 '004saint',
 '005',
 '0050',
 '005380',
 '005930',
 '006',
 '00654',
 '00684',
 '007',
 '0076',
 '007kzman',
 '008',
 '00867',
 '009',
 '00am',
 '00c6j7capuhttps',
 '00o',
 '00pm',
 '00pme',
 '01',
 '010',
 '0100',
 '011',
 '0112',
 '012',
 '01233',
 '0129',
 '013',
 '014',
 '01494',
 '015',
 '01511',
 '01517',
 '016',
 '017',
 

In [55]:
# checking the shape of X_train
X_train.shape

(35918, 111440)

In [56]:
# vectorizing X_test
X_test = vectorizer.transform(X_test)

In [57]:
# checking the shape of X_test
X_test.shape

(8980, 111440)

## Approach 1: Naive Bayes

In [58]:
# initializing the Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

In [59]:
# fitting the Naive Bayes classifier on the training dataset
mnb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [60]:
# getting the predictions from the Naive Bayes classifer on the test dataset
mnb_y_pred = mnb.predict(X_test)

In [61]:
# checking the distribution of predicted labels
pd.Series(mnb_y_pred).value_counts()

1    4716
0    4264
dtype: int64

In [62]:
# importing classification_report, which displays a variety of performance metrics
from sklearn.metrics import classification_report

In [63]:
# checking the performance of the Naive Bayes classifier
print(classification_report(y_test, mnb_y_pred))

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      4330
           1       0.93      0.95      0.94      4650

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980



### Already the accuracy is at 94%, which is fairly remarkable.

## Approach 2: Logistic Regression

In [64]:
# initializing the Logistic Regression classifier
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42)

In [65]:
# fitting the Logistic Regression classifier on the training dataset
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
# getting the predictions from the Logistic Regression classifer on the test dataset
log_reg_y_pred = log_reg.predict(X_test)

In [67]:
# checking the performance of the Logistic Regression classifier
print(classification_report(y_test, log_reg_y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4330
           1       0.99      0.99      0.99      4650

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



### Now the accuracy is at 99%, which is amazing. However, this raises the concern of potential overfitting. 

## Checking for Potential Overfitting: N-Grams

### Real or fake news stories might have repeating phrases, which could potentially lead to overfitting if an ML model is picking up on the repeating phrases rather than other features in the text. 

### To avoid this potential overfitting, we can check the most common unigrams, bigrams, trigrams, and 4-grams in the real news corpus and fake news corpus and see whether any of these n-grams need to be removed. 

In [68]:
# creating a function to get n-grams from a text
def get_ngrams(text, n):
    text = clean_text(text)
    if n < 1 or type(n) != int:
        raise ValueError('n must be an integer greater than 0')
    if n == 1:
        return text
    else:
        return list(ngrams(text, n))

In [69]:
# concatenating all the real news stories and fake news stories into single respective strings
# https://stackoverflow.com/questions/41400381/python-pandas-concatenate-a-series-of-strings-into-one-string
real_text = ' '.join(real['text'])
fake_text = ' '.join(fake['text'])

In [70]:
%%time
# extracting all the unigrams, bigrams, trigrams, and 4-grams from the real news string
real_unigrams = get_ngrams(real_text, 1) 
real_bigrams = get_ngrams(real_text, 2)
real_trigrams = get_ngrams(real_text, 3)
real_4grams = get_ngrams(real_text, 4)

CPU times: user 2min 44s, sys: 3.24 s, total: 2min 48s
Wall time: 2min 47s


In [71]:
%%time
# extracting all the unigrams, bigrams, trigrams, and 4-grams from the fake news string
fake_unigrams = get_ngrams(fake_text, 1)
fake_bigrams = get_ngrams(fake_text, 2)
fake_trigrams = get_ngrams(fake_text, 3)
fake_4grams = get_ngrams(fake_text, 4)

CPU times: user 1min 48s, sys: 2.87 s, total: 1min 51s
Wall time: 1min 51s


In [72]:
# checking the top unigrams in the real news stories
pd.Series(real_unigrams).value_counts()[:50]

s                 44875
trump             42599
reuters           28404
president         25548
state             18747
government        17980
states            17637
new               16786
house             16406
united            15572
republican        15292
people            15114
told              14244
washington        12143
trumps            11751
election          11504
party             11356
year              10499
donald            10445
campaign          10422
security          10098
percent            9948
$                  9898
north              9843
white              9442
court              9019
senate             8990
minister           8645
officials          8488
years              8270
democratic         8240
country            8235
foreign            8197
national           8183
including          8119
week               8073
presidential       8012
military           7944
china              7941
clinton            7860
law                7785
tax             

In [73]:
# checking the top bigrams in the real news stories
pd.Series(real_bigrams).value_counts()[:50]

(united, states)              12188
(white, house)                 8291
(donald, trump)                7974
(washington, reuters)          6492
(president, donald)            5929
(north, korea)                 5284
(new, york)                    4336
(prime, minister)              4118
(told, reuters)                3491
(islamic, state)               3367
(told, reporters)              3176
(president, barack)            2903
(supreme, court)               2382
(barack, obama)                2380
(united, nations)              2266
(house, representatives)       2259
(secretary, state)             2238
(hillary, clinton)             2189
(donald, trumps)               2158
(national, security)           2130
(reuters, president)           2088
(human, rights)                2079
(european, union)              1914
(presidential, election)       1877
(trump, administration)        1865
(saudi, arabia)                1860
(state, department)            1753
(foreign, minister)         

In [74]:
# checking the top trigrams in the real news stories
pd.Series(real_trigrams).value_counts()[:50]

(president, donald, trump)               4429
(president, barack, obama)               1993
(reuters, president, donald)             1562
(president, donald, trumps)              1428
(washington, reuters, president)         1347
(president, barack, obamas)               910
(north, korea, s)                         832
(president, vladimir, putin)              769
(nov, 8, election)                        768
(new, york, reuters)                      740
(secretary, state, rex)                   672
(state, rex, tillerson)                   661
(presidentelect, donald, trump)           642
(national, security, adviser)             614
(russian, president, vladimir)            611
(george, w, bush)                         600
(speaker, paul, ryan)                     588
(prime, minister, theresa)                584
(chancellor, angela, merkel)              558
(leader, mitch, mcconnell)                553
(democratic, president, barack)           547
(new, york, times)                

In [75]:
# checking the top 4-grams in the real news stories
pd.Series(real_4grams).value_counts()[:50]

(reuters, president, donald, trump)               1306
(washington, reuters, president, donald)          1086
(secretary, state, rex, tillerson)                 660
(russian, president, vladimir, putin)              596
(majority, leader, mitch, mcconnell)               431
(senate, majority, leader, mitch)                  406
(vice, president, mike, pence)                     391
(president, george, w, bush)                       369
(reuters, president, barack, obama)                352
(nov, 8, presidential, election)                   351
(democratic, president, barack, obama)             345
(president, donald, trump, s)                      341
(house, speaker, paul, ryan)                       311
(immediately, respond, request, comment)           299
(chinese, president, xi, jinping)                  298
(french, president, emmanuel, macron)              291
(british, prime, minister, theresa)                287
(presidential, candidate, donald, trump)           276
(german, c

In [76]:
# checking the top unigrams in the fake news stories
pd.Series(fake_unigrams).value_counts()[:50]

s              129446
trump           73422
t               40602
people          25941
president       25495
clinton         17957
obama           17760
like            17596
donald          17100
new             14135
news            14099
hillary         13510
white           12767
time            12689
state           12494
media           10980
house           10545
america         10523
campaign        10519
know            10256
american         9920
going            9741
$                9613
image            9572
states           9457
told             9100
republican       8976
right            8709
country          8640
government       8579
police           8527
way              8427
think            8319
years            8212
united           7966
election         7949
video            7810
political        7545
party            7440
black            7436
want             7283
women            7125
national         7064
republicans      7022
says             6933
world     

In [77]:
# checking the top bigrams in the fake news stories
pd.Series(fake_bigrams).value_counts()[:50]

(trump, s)                  13998
(donald, trump)             13076
(hillary, clinton)           6705
(white, house)               6284
(united, states)             6167
(new, york)                  4179
(president, obama)           3794
(president, trump)           3615
(obama, s)                   3472
(isn, t)                     3403
(clinton, s)                 3185
(fox, news)                  3148
(wasn, t)                    2547
(21st, century)              2225
(barack, obama)              2214
(won, t)                     2157
(donald, j)                  2154
(j, trump)                   2121
(century, wire)              1926
(t, know)                    1905
(america, s)                 1802
(fake, news)                 1768
(supreme, court)             1766
(trump, realdonaldtrump)     1694
(national, security)         1634
(obama, administration)      1616
(social, media)              1609
(t, want)                    1596
(law, enforcement)           1552
(secretary, st

In [78]:
# checking the top trigrams in the fake news stories
pd.Series(fake_trigrams).value_counts()[:50]

(donald, trump, s)                   2197
(donald, j, trump)                   2113
(21st, century, wire)                1902
(j, trump, realdonaldtrump)          1692
(new, york, times)                   1470
(black, lives, matter)               1224
(hillary, clinton, s)                1190
(news, 21st, century)                 941
(president, united, states)           901
(president, donald, trump)            877
(president, barack, obama)            856
(century, wire, says)                 727
(video, screen, capture)              724
(image, video, screen)                695
(president, obama, s)                 687
(president, trump, s)                 672
(trump, s, campaign)                  669
(images, donald, trump)               647
(new, york, city)                     604
(george, w, bush)                     564
(director, james, comey)              479
(fbi, director, james)                477
(21wire, subscribe, member)           461
(subscribe, member, 21wiretv)     

In [79]:
# checking the top 4-grams in the fake news stories
pd.Series(fake_4grams).value_counts()[:50]

(donald, j, trump, realdonaldtrump)                 1692
(news, 21st, century, wire)                          937
(21st, century, wire, says)                          707
(image, video, screen, capture)                      655
(fbi, director, james, comey)                        442
(21wire, subscribe, member, 21wiretv)                441
(member, 21wiretv, 21st, century)                    357
(21wiretv, 21st, century, wire)                      351
(filessupport, 21wire, subscribe, member)            318
(work, subscribing, member, 21wiretv)                278
(image, chip, somodevillagetty, images)              253
(filessupport, work, subscribing, member)            238
(alternate, current, radio, network)                 229
(image, alex, wonggetty, images)                     228
(president, donald, trump, s)                        216
(white, house, press, secretary)                     214
(current, radio, network, acr)                       213
(j, trump, realdonaldtrump, jan

## Approach 3: Naive Bayes with names of news outlets removed

In [80]:
# creating a copy of the original corpus
revised_corpus = corpus.copy()

In [81]:
# lowercasing the copied corpus
revised_corpus = [text.lower() for text in revised_corpus]

In [82]:
# removing certain terms from the revised corpus that may have previously caused some overfitting
revised_corpus = [text.replace('washington (reuters)', '') for text in revised_corpus]
revised_corpus = [text.replace('reuters', '') for text in revised_corpus]
revised_corpus = [text.replace('21st century wire', '') for text in revised_corpus]
revised_corpus = [text.replace('21wiretv', '') for text in revised_corpus]

In [83]:
# splitting the revised corpus into training and test datasets 
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(revised_corpus, y, test_size=0.2, random_state=42)

In [84]:
# fitting and transforming the new X_train with the TF-IDF vectorizer
X_TRAIN = vectorizer.fit_transform(X_TRAIN)

In [85]:
# vectorizing the new X_test
X_TEST = vectorizer.transform(X_TEST)

In [86]:
# retraining the Naive Bayes classifier on the new training dataset
mnb.fit(X_TRAIN, Y_TRAIN)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [87]:
# getting the updated predictions from the Naive Bayes classifier
MNB_Y_PRED = mnb.predict(X_TEST)

In [88]:
# checking the distribution of updated predictions from the Naive Bayes classifier
pd.Series(MNB_Y_PRED).value_counts()

1    4735
0    4245
dtype: int64

In [89]:
# checking the updated performance of the Naive Bayes classifier on the revised test dataset
print(classification_report(Y_TEST, MNB_Y_PRED))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      4330
           1       0.93      0.95      0.94      4650

    accuracy                           0.94      8980
   macro avg       0.94      0.93      0.93      8980
weighted avg       0.94      0.94      0.94      8980



### The accuracy of the Naive Bayes classifier is still at 94%, suggesting that there wasn't much overfitting occurring, at least on the frequent terms deleted such as "Reuters" and "21st Century Wire." 

## Approach 4: Logistic Regression with names of news outlets removed

In [90]:
# fitting the Logistic Regression classifier on the training dataset from the revised corpus
log_reg.fit(X_TRAIN, Y_TRAIN)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [91]:
# getting the updated predictions from the Logistic Regression classifier
LOG_REG_Y_PRED = log_reg.predict(X_TEST)

In [92]:
# checking the updated performance of the Logistic Regression classifier on the revised test dataset
print(classification_report(Y_TEST, LOG_REG_Y_PRED))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4330
           1       0.99      0.98      0.98      4650

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



### Here the Logistic Regression classifier has slightly dropped in accuracy from 99% to 98% after terms such as "Reuters" and "21st Century Wire" had been deleted, suggesting that some, but not much, overfitting had been occurring previously. 

## Approach 5: Text Classification with BERT via simpletransformers

In [93]:
%%capture
# installing the simpletransformers Python library
!pip install simpletransformers

In [94]:
# importing the simpletransformers library
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

In [95]:
# configuring the simpletransformers library based on its documentation
# https://simpletransformers.ai/docs/binary-classification/
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [96]:
# getting the training and test datasets split from the revised corpus
X_train, X_test, y_train, y_test = train_test_split(revised_corpus, y, test_size=0.2, random_state=42)

In [97]:
# creating a dataframe of the text and labels from the training dataset
train_df = pd.DataFrame(columns=['text', 'labels'])
train_df['text'] = X_train
train_df['labels'] = y_train
train_df

Unnamed: 0,text,labels
0,this story is from 2006 and is the first in a ...,1
1,paris () - the leaders of the european union s...,0
2,star wars icon mark hamill previously mocked d...,1
3,the whiner-in-chief struck again with yet anot...,1
4,donald trump doesn t have much foreign policy ...,1
...,...,...
35913,abuja () - the united states has formally agre...,0
35914,tune in to the alternate current radio network...,1
35915,i m convinced the freedom from religion group...,1
35916,- the republican tax plan unveiled on thursda...,0


In [98]:
# creating a dataframe of the text and labels from the test dataset
eval_df = pd.DataFrame(columns=['text', 'labels'])
eval_df['text'] = X_test
eval_df['labels'] = y_test
eval_df

Unnamed: 0,text,labels
0,"donald trump s white house is in chaos, and th...",1
1,now that donald trump is the presumptive gop n...,1
2,mike pence is a huge homophobe. he supports ex...,1
3,san francisco () - california attorney general...,0
4,twisted reasoning is all that comes from pelos...,1
...,...,...
8975,hillary s over-the-top plan to tax the consume...,1
8976,- president donald trump on wednesday propose...,0
8977,donald trump s white house is the leakiest in ...,1
8978,ankara () - iran rejected as wrong some brit...,0


In [99]:
%%time
# training and evaluating a BERT model based on the simpletransformer library's documentation
# https://simpletransformers.ai/docs/binary-classification/
model_args = ClassificationArgs(num_train_epochs=2, overwrite_output_dir=True)
model = ClassificationModel("bert", "bert-base-uncased", args=model_args)
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

DEBUG:filelock:Attempting to acquire lock 140175421219152 on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock
DEBUG:filelock:Lock 140175421219152 acquired on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140175421219152 on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock
DEBUG:filelock:Lock 140175421219152 released on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock
DEBUG:filelock:Attempting to acquire lock 140171250408464 on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
DEBUG:filelock:Lock 140171250408464 acquired on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140171250408464 on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
DEBUG:filelock:Lock 140171250408464 released on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or wi

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140164978981968 on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
DEBUG:filelock:Lock 140164978981968 released on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
DEBUG:filelock:Attempting to acquire lock 140165012025488 on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
DEBUG:filelock:Lock 140165012025488 acquired on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140165012025488 on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
DEBUG:filelock:Lock 140165012025488 released on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
DEBUG:filelock:Attempting to acquire lock 140164978499280 on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
DEBUG:filelock:Lock 140164978499280 acquired on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140164978499280 on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
DEBUG:filelock:Lock 140164978499280 released on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/35918 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_128_2_2


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/4490 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


Running Epoch 1 of 2:   0%|          | 0/4490 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/8980 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_128_2_2


Running Evaluation:   0%|          | 0/1123 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.9997770227644949, 'tp': 4650, 'tn': 4329, 'fp': 1, 'fn': 0, 'auroc': 0.999999702004023, 'auprc': 0.9999997224316378, 'eval_loss': 0.0010669507145987806}


CPU times: user 1h 34min 43s, sys: 36min 26s, total: 2h 11min 10s
Wall time: 2h 12min 31s


In [100]:
# checking the performance of the BERT model
result

{'auprc': 0.9999997224316378,
 'auroc': 0.999999702004023,
 'eval_loss': 0.0010669507145987806,
 'fn': 0,
 'fp': 1,
 'mcc': 0.9997770227644949,
 'tn': 4329,
 'tp': 4650}

In [101]:
# checking the model outputs
model_outputs

array([[-6.16796875,  4.984375  ],
       [-6.19140625,  5.0078125 ],
       [-6.16796875,  4.98828125],
       ...,
       [-6.12109375,  4.9609375 ],
       [ 6.21484375, -5.1796875 ],
       [ 6.21484375, -5.171875  ]])

In [102]:
# checking the wrong predictions (empty list)
wrong_predictions

[]

### Here BERT achieved 99% accuracy on the revised corpus that had terms such as "Reuters" and "21st Century Wire" deleted to prevent potential overfitting. This is higer than the performance of the Logistic Regression (98%) and Naive Bayes (94%) classifiers. Of course, it took a couple hours to train the BERT model, but the better performance of the model and ability to feed the raw text into the model without preprocessing makes the training time worth it! 