In [1]:
#Import data into respective text or label data
import pandas as pd
data2 = pd.read_csv("faketrainer.csv", header = None)
textData = data2[3]
textData = textData.iloc[1:]
labelData = data2[4]
labelData = labelData.iloc[1:]
print(textData)
print(labelData)

1        House Dem Aide: We Didn’t Even See Comey’s Let...
2        Ever get the feeling your life circles the rou...
3        Why the Truth Might Get You Fired October 29, ...
4        Videos 15 Civilians Killed In Single US Airstr...
5        Print \nAn Iranian woman has been sentenced to...
6        In these trying times, Jackie Mason is the Voi...
7        Ever wonder how Britain’s most iconic pop pian...
8        PARIS  —   France chose an idealistic, traditi...
9        Donald J. Trump is scheduled to make a highly ...
10       A week before Michael T. Flynn resigned as nat...
11       Organizing for Action, the activist group that...
12       The BBC produced spoof on the “Real Housewives...
13       The mystery surrounding The Third Reich and Na...
14       Clinton Campaign Demands FBI Affirm Trump's Ru...
15       Yes, There Are Paid Government Trolls On Socia...
16       Guillermo Barros Schelotto was not the first A...
17       The scandal engulfing Wells Fargo toppled its .

In [2]:
#Import and download needed nltk files
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ethanlynagh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ethanlynagh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Format text to remove numbers, extra spaces, trailing spaces, and NaN values
textData = textData.str.lower()
textData = textData.str.replace(r'[^\w\d\s]', ' ')
textData = textData.str.replace(r'\s+', ' ')
textData = textData.str.replace(r'^\s+|\s+?$', '')
textData = textData.str.replace(r'\d+(\.\d+)?', 'numbr')
textData = textData.fillna("")
print(textData)

1        house dem aide we didn t even see comey s lett...
2        ever get the feeling your life circles the rou...
3        why the truth might get you fired october numb...
4        videos numbr civilians killed in single us air...
5        print an iranian woman has been sentenced to s...
6        in these trying times jackie mason is the voic...
7        ever wonder how britain s most iconic pop pian...
8        paris france chose an idealistic traditional c...
9        donald j trump is scheduled to make a highly a...
10       a week before michael t flynn resigned as nati...
11       organizing for action the activist group that ...
12       the bbc produced spoof on the real housewives ...
13       the mystery surrounding the third reich and na...
14       clinton campaign demands fbi affirm trump s ru...
15       yes there are paid government trolls on social...
16       guillermo barros schelotto was not the first a...
17       the scandal engulfing wells fargo toppled its .

In [4]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words = set(stopwords.words('english'))

textData = textData.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))
print(textData)

1        house dem aide even see comey letter jason cha...
2        ever get feeling life circles roundabout rathe...
3        truth might get fired october numbr numbr tens...
4        videos numbr civilians killed single us airstr...
5        print iranian woman sentenced six years prison...
6        trying times jackie mason voice reason week ex...
7        ever wonder britain iconic pop pianist gets lo...
8        paris france chose idealistic traditional cand...
9        donald j trump scheduled make highly anticipat...
10       week michael flynn resigned national security ...
11       organizing action activist group morphed barac...
12       bbc produced spoof real housewives tv programm...
13       mystery surrounding third reich nazi germany s...
14       clinton campaign demands fbi affirm trump russ...
15       yes paid government trolls social media blogs ...
16       guillermo barros schelotto first argentine pla...
17       scandal engulfing wells fargo toppled chairman.

In [5]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in textData:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [6]:
# Print number of words and the most common
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 176701
Most common words: [('numbr', 212088), ('said', 80054), ('mr', 66287), ('trump', 56258), ('one', 38651), ('would', 37358), ('people', 36540), ('new', 30056), ('clinton', 27399), ('like', 26341), ('also', 25448), ('president', 25337), ('time', 21114), ('state', 20389), ('us', 20150)]


In [7]:
#Set features to be the 7,500 most common words
word_features = list(all_words.keys())[:7500]

In [8]:
#Define function to count occurence of feature words in article
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

features = find_features(textData[1])
for key, value in features.items():
    if value == True:
        print(key)

house
dem
aide
even
see
comey
letter
jason
chaffetz
tweeted
darrell
lucus
october
numbr
subscribe
stump
american
fork
utah
image
courtesy
michael
jolley
available
creative
commons
license
apologies
keith
olbermann
doubt
worst
person
world
week
fbi
director
james
according
democratic
looks
like
also
know
second
well
turns
sent
infamous
announcing
looking
emails
may
related
hillary
clinton
email
server
ranking
democrats
relevant
committees
hear
found
via
tweet
one
republican
committee
chairmen
notified
members
intelligence
judiciary
oversight
agency
reviewing
recently
discovered
order
contained
classified
information
long
went
chairman
set
political
ablaze
dir
informed
learned
existence
appear
pertinent
investigation
case
reopened
jasoninthehouse
course
actually
saying
light
unrelated
anthony
weiner
sexting
teenager
apparently
little
things
facts
matter
already
vowed
initiate
raft
investigations
wins
least
two
years
worth
possibly
entire
term
thought
work
resulting
briefly
roiled
nation


In [9]:
#Import numpy
import numpy as np

In [10]:
from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values, 0 = real, 1 = fake
encoder = LabelEncoder()
Y = encoder.fit_transform(labelData)

print(Y[:10])

[1 0 1 1 1 0 1 0 0 0]


In [11]:
# Do it for all the messages
texts = list(zip(textData, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(texts)

# call find_features function for each article text
featuresets = [(find_features(text), label) for (text, label) in texts]

In [12]:
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [13]:
print(len(training))
print(len(testing))

15600
5200


In [14]:
# use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the linear model on the training data
model.train(training)

# and test on the testing dataset
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))


SVC Accuracy: 94.38461538461539


In [15]:
# use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC

modelsig = SklearnClassifier(SVC(kernel = 'sigmoid'))

# train the sigmoid model on the training data
modelsig.train(training)

# and test on the testing dataset
accuracysig = nltk.classify.accuracy(modelsig, testing)*100
print("SVC sig Accuracy: {}".format(accuracy))

SVC sig Accuracy: 94.38461538461539


In [16]:
#Creat classifier version to get insight of what words carry the most weight
classifier = nltk.NaiveBayesClassifier.train(training)

In [17]:
print(nltk.classify.accuracy(classifier, testing))

0.7357692307692307


In [18]:
classifier.show_most_informative_features(15)

Most Informative Features
                     pre = True                1 : 0      =    159.8 : 1.0
                    anti = True                1 : 0      =     89.8 : 1.0
                     non = True                1 : 0      =     82.0 : 1.0
               jeff_poor = True                0 : 1      =     54.5 : 1.0
                  neocon = True                1 : 0      =     49.5 : 1.0
                     www = True                1 : 0      =     35.7 : 1.0
                    ussr = True                1 : 0      =     32.9 : 1.0
                     mid = True                1 : 0      =     32.7 : 1.0
                     tex = True                0 : 1      =     30.7 : 1.0
                     gmo = True                1 : 0      =     29.6 : 1.0
                 janeiro = True                0 : 1      =     27.4 : 1.0
                     pam = True                0 : 1      =     23.8 : 1.0
               zerohedge = True                1 : 0      =     22.9 : 1.0

In [19]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = model.classify_many(txt_features)

In [20]:
# print a confusion matrix and a classification report
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
x = classification_report(labels, prediction)
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['real', 'fake']],
    columns = [['predicted', 'predicted'], ['real', 'fake']])

             precision    recall  f1-score   support

          0       0.94      0.94      0.94      2599
          1       0.94      0.95      0.94      2601

avg / total       0.94      0.94      0.94      5200



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,real,fake
actual,real,2450,149
actual,fake,143,2458


In [22]:
# import secondary data to test with
data3 = pd.read_csv("fake_or_real_news.csv", header = None)
textData3 = data3[2]
textData3 = textData3.iloc[1:]
labelData3 = data3[3]
labelData3 = labelData3.iloc[1:]
print(textData3)
print(labelData3)

1       Daniel Greenfield, a Shillman Journalism Fello...
2       Google Pinterest Digg Linkedin Reddit Stumbleu...
3       U.S. Secretary of State John F. Kerry said Mon...
4       — Kaydee King (@KaydeeKing) November 9, 2016 T...
5       It's primary day in New York and front-runners...
6         \nI’m not an immigrant, but my grandparents ...
7       Share This Baylee Luciani (left), Screenshot o...
8       A Czech stockbroker who saved more than 650 Je...
9       Hillary Clinton and Donald Trump made some ina...
10      Iranian negotiators reportedly have made a las...
11      CEDAR RAPIDS, Iowa — “I had one of the most wo...
12      Donald Trump’s organizational problems have go...
13      Click Here To Learn More About Alexandra's Per...
14      October 31, 2016 at 4:52 am \nPretty factual e...
15      Killing Obama administration rules, dismantlin...
16      As more women move into high offices, they oft...
17      Shocking! Michele Obama & Hillary Caught Glamo...
18      0 \nHi

In [23]:
# process the data in the same way
textData3 = textData3.str.lower()
textData3 = textData3.str.replace(r'[^\w\d\s]', ' ')
textData3 = textData3.str.replace(r'\s+', ' ')
textData3 = textData3.str.replace(r'^\s+|\s+?$', '')
textData3 = textData3.str.replace(r'\d+(\.\d+)?', 'numbr')
textData3 = textData3.fillna("")
print(textData3)

1       daniel greenfield a shillman journalism fellow...
2       google pinterest digg linkedin reddit stumbleu...
3       u s secretary of state john f kerry said monda...
4       kaydee king kaydeeking november numbr numbr th...
5       it s primary day in new york and front runners...
6       i m not an immigrant but my grandparents are m...
7       share this baylee luciani left screenshot of w...
8       a czech stockbroker who saved more than numbr ...
9       hillary clinton and donald trump made some ina...
10      iranian negotiators reportedly have made a las...
11      cedar rapids iowa i had one of the most wonder...
12      donald trump s organizational problems have go...
13      click here to learn more about alexandra s per...
14      october numbr numbr at numbr numbr am pretty f...
15      killing obama administration rules dismantling...
16      as more women move into high offices they ofte...
17      shocking michele obama hillary caught glamoriz...
18      numbr 

In [24]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words3 = set(stopwords.words('english'))

textData3 = textData3.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words3))
print(textData3)

1       daniel greenfield shillman journalism fellow f...
2       google pinterest digg linkedin reddit stumbleu...
3       u secretary state john f kerry said monday sto...
4       kaydee king kaydeeking november numbr numbr le...
5       primary day new york front runners hillary cli...
6       immigrant grandparents numbr years ago arrived...
7       share baylee luciani left screenshot baylee ca...
8       czech stockbroker saved numbr jewish children ...
9       hillary clinton donald trump made inaccurate c...
10      iranian negotiators reportedly made last ditch...
11      cedar rapids iowa one wonderful rallies entire...
12      donald trump organizational problems gone bad ...
13      click learn alexandra personalized essences ps...
14      october numbr numbr numbr numbr pretty factual...
15      killing obama administration rules dismantling...
16      women move high offices often bring style appr...
17      shocking michele obama hillary caught glamoriz...
18      numbr 

In [25]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words3 = []

for message in textData3:
    words3 = word_tokenize(message)
    for w in words3:
        all_words3.append(w)
        
all_words3 = nltk.FreqDist(all_words3)
print('Number of words: {}'.format(len(all_words3)))
print('Most common words: {}'.format(all_words3.most_common(15)))

Number of words: 65589
Most common words: [('numbr', 67960), ('trump', 22437), ('said', 21212), ('clinton', 17503), ('would', 12781), ('one', 11878), ('people', 11750), ('new', 9331), ('state', 9211), ('president', 8844), ('obama', 8234), ('also', 8225), ('us', 7866), ('campaign', 7719), ('like', 7248)]


In [26]:
labelData3 = data3[3]
labelData3 = labelData3.iloc[1:]
print(labelData3)

1       FAKE
2       FAKE
3       REAL
4       FAKE
5       REAL
6       FAKE
7       FAKE
8       REAL
9       REAL
10      REAL
11      REAL
12      REAL
13      FAKE
14      FAKE
15      REAL
16      REAL
17      FAKE
18      FAKE
19      REAL
20      REAL
21      REAL
22      FAKE
23      REAL
24      REAL
25      FAKE
26      REAL
27      REAL
28      REAL
29      REAL
30      FAKE
        ... 
6306    FAKE
6307    FAKE
6308    FAKE
6309    FAKE
6310    REAL
6311    REAL
6312    REAL
6313    FAKE
6314    FAKE
6315    FAKE
6316    REAL
6317    REAL
6318    FAKE
6319    FAKE
6320    REAL
6321    FAKE
6322    FAKE
6323    FAKE
6324    REAL
6325    REAL
6326    FAKE
6327    FAKE
6328    REAL
6329    FAKE
6330    FAKE
6331    REAL
6332    FAKE
6333    FAKE
6334    REAL
6335    REAL
Name: 3, Length: 6335, dtype: object


In [27]:
from sklearn.preprocessing import LabelEncoder

labelData3 = labelData3.replace(['FAKE', 'REAL'], [1, 0]) 

print(labelData3)
# convert class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
Y3 = encoder.fit_transform(labelData3)

print(Y3[:10])

1       1
2       1
3       0
4       1
5       0
6       1
7       1
8       0
9       0
10      0
11      0
12      0
13      1
14      1
15      0
16      0
17      1
18      1
19      0
20      0
21      0
22      1
23      0
24      0
25      1
26      0
27      0
28      0
29      0
30      1
       ..
6306    1
6307    1
6308    1
6309    1
6310    0
6311    0
6312    0
6313    1
6314    1
6315    1
6316    0
6317    0
6318    1
6319    1
6320    0
6321    1
6322    1
6323    1
6324    0
6325    0
6326    1
6327    1
6328    0
6329    1
6330    1
6331    0
6332    1
6333    1
6334    0
6335    0
Name: 3, Length: 6335, dtype: int64
[1 1 0 1 0 1 1 0 0 0]


In [28]:
# Now lets do it for all the messages
texts3 = list(zip(textData3, Y3))

# define a seed for reproducibility
seed3 = 1
np.random.seed = seed3
np.random.shuffle(texts3)

# call find_features function for each SMS message
featuresets3 = [(find_features(text3), label3) for (text3, label3) in texts3]

In [29]:
accuracy = nltk.classify.accuracy(model, featuresets3)*100

In [30]:
# Accuracy on the outside data set
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 77.50591949486977


In [52]:
#Define function to test single articles
def isPredictCorrect(texter, faker):
    texts = texter
    texts = texts.lower()
    texts = texts.replace(r'[^\w\d\s]', ' ')
    texts = texts.replace(r'\s+', ' ')
    texts = texts.replace(r'^\s+|\s+?$', '')
    texts = texts.replace(r'\d+(\.\d+)?', 'numbr')
    predic = [(find_features(texts), faker)]
    accuracy = nltk.classify.accuracy(model, predic)*100
    if(faker == 0):
        print("Guessing real article")
    if(faker == 1):
        print("Guessing fake article")
    print("SVC Accuracy: {}".format(accuracy))

In [53]:
# testing single article function
ahh = "Nested on the servers’ motherboards, the testers found a tiny microchip, not much bigger than a grain of rice, that wasn’t part of the boards’ original design. Amazon reported the discovery to U.S. authorities, sending a shudder through the intelligence community. Elemental’s servers could be found in Department of Defense data centers, the CIA’s drone operations, and the onboard networks of Navy warships. And Elemental was just one of hundreds of Supermicro customers."
isPredictCorrect(ahh, 1)

Guessing fake article
SVC Accuracy: 100.0
