In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.layers import LSTM, Dense

In [2]:
data= pd.read_csv('news.csv')
data.head(5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
data=data.drop('text',axis=1)
data.head(5)
 

Unnamed: 0.1,Unnamed: 0,title,label
0,8476,You Can Smell Hillary’s Fear,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,875,The Battle of New York: Why This Primary Matters,REAL


In [4]:
#data=data.drop('text',axis=1)

In [5]:
data = data.rename(columns={'title':'text'})
data.head(5)


Unnamed: 0.1,Unnamed: 0,text,label
0,8476,You Can Smell Hillary’s Fear,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,875,The Battle of New York: Why This Primary Matters,REAL


In [6]:
import string
def rem_punct(text):
        wo= [word for word in text if word not in string.punctuation]
        st= ''.join(wo)
        return st

data['text_punc'] = data['text'].apply(lambda x: rem_punct(x))
data.head(5)


Unnamed: 0.1,Unnamed: 0,text,label,text_punc
0,8476,You Can Smell Hillary’s Fear,FAKE,You Can Smell Hillary’s Fear
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,REAL,Kerry to go to Paris in gesture of sympathy
3,10142,Bernie supporters on Twitter erupt in anger ag...,FAKE,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,REAL,The Battle of New York Why This Primary Matters


In [7]:
import re
def tokenize(text):
    s=re.split("\W+",text)
    return s


data['text_tokens']=data['text_punc'].apply(lambda x: tokenize(x.lower()))
data.head(5)

Unnamed: 0.1,Unnamed: 0,text,label,text_punc,text_tokens
0,8476,You Can Smell Hillary’s Fear,FAKE,You Can Smell Hillary’s Fear,"[you, can, smell, hillary, s, fear]"
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...,"[watch, the, exact, moment, paul, ryan, commit..."
2,3608,Kerry to go to Paris in gesture of sympathy,REAL,Kerry to go to Paris in gesture of sympathy,"[kerry, to, go, to, paris, in, gesture, of, sy..."
3,10142,Bernie supporters on Twitter erupt in anger ag...,FAKE,Bernie supporters on Twitter erupt in anger ag...,"[bernie, supporters, on, twitter, erupt, in, a..."
4,875,The Battle of New York: Why This Primary Matters,REAL,The Battle of New York Why This Primary Matters,"[the, battle, of, new, york, why, this, primar..."


In [8]:

from nltk.corpus import stopwords

In [9]:
def stopword_remove(text):
    text = [word for word in text if word not in stopwords.words('english')]
    return text

data['text_stopword_NA'] = data['text_tokens'].apply(lambda x: stopword_remove(x))
data.head(5)

Unnamed: 0.1,Unnamed: 0,text,label,text_punc,text_tokens,text_stopword_NA
0,8476,You Can Smell Hillary’s Fear,FAKE,You Can Smell Hillary’s Fear,"[you, can, smell, hillary, s, fear]","[smell, hillary, fear]"
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...,"[watch, the, exact, moment, paul, ryan, commit...","[watch, exact, moment, paul, ryan, committed, ..."
2,3608,Kerry to go to Paris in gesture of sympathy,REAL,Kerry to go to Paris in gesture of sympathy,"[kerry, to, go, to, paris, in, gesture, of, sy...","[kerry, go, paris, gesture, sympathy]"
3,10142,Bernie supporters on Twitter erupt in anger ag...,FAKE,Bernie supporters on Twitter erupt in anger ag...,"[bernie, supporters, on, twitter, erupt, in, a...","[bernie, supporters, twitter, erupt, anger, dn..."
4,875,The Battle of New York: Why This Primary Matters,REAL,The Battle of New York Why This Primary Matters,"[the, battle, of, new, york, why, this, primar...","[battle, new, york, primary, matters]"


# Lemmitize/Stemming

In [10]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def lemmitize(text):
    text = [ps.stem(word) for word in text]
    text = ' '.join(text)
    return text

data['text_lemm'] = data['text_stopword_NA'].apply(lambda x:lemmitize(x))
data.head(5)

Unnamed: 0.1,Unnamed: 0,text,label,text_punc,text_tokens,text_stopword_NA,text_lemm
0,8476,You Can Smell Hillary’s Fear,FAKE,You Can Smell Hillary’s Fear,"[you, can, smell, hillary, s, fear]","[smell, hillary, fear]",smell hillari fear
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...,"[watch, the, exact, moment, paul, ryan, commit...","[watch, exact, moment, paul, ryan, committed, ...",watch exact moment paul ryan commit polit suic...
2,3608,Kerry to go to Paris in gesture of sympathy,REAL,Kerry to go to Paris in gesture of sympathy,"[kerry, to, go, to, paris, in, gesture, of, sy...","[kerry, go, paris, gesture, sympathy]",kerri go pari gestur sympathi
3,10142,Bernie supporters on Twitter erupt in anger ag...,FAKE,Bernie supporters on Twitter erupt in anger ag...,"[bernie, supporters, on, twitter, erupt, in, a...","[bernie, supporters, twitter, erupt, anger, dn...",berni support twitter erupt anger dnc tri warn
4,875,The Battle of New York: Why This Primary Matters,REAL,The Battle of New York Why This Primary Matters,"[the, battle, of, new, york, why, this, primar...","[battle, new, york, primary, matters]",battl new york primari matter


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

x = np.array(data['text_lemm'])
y = data['label']


#x = x.to_numpy(x)
longest_list = max(len(elem) for elem in x)
print(x)
print(longest_list)


['smell hillari fear'
 'watch exact moment paul ryan commit polit suicid trump ralli video'
 'kerri go pari gestur sympathi' ...
 'antitrump protest tool oligarchi inform'
 'ethiopia obama seek progress peac secur east africa'
 'jeb bush suddenli attack trump here matter']
194


In [12]:
from sklearn.preprocessing import LabelEncoder
scaler=LabelEncoder()
y=scaler.fit_transform(y)
y=np.array(y)
print(y)

[0 0 1 ... 0 1 1]


In [13]:
from tensorflow.keras.preprocessing.text import one_hot
voc_size = 5000

one_hot_rep = [one_hot(words, voc_size) for words in x]
print(one_hot_rep)

[[243, 132, 4371], [945, 2264, 4218, 2671, 888, 4935, 3899, 1458, 2125, 1416, 4280], [727, 3863, 4717, 2961, 3912], [3658, 3104, 3778, 1111, 3237, 353, 3694, 189], [1944, 4868, 364, 3303, 2292], [1080, 4600], [3157, 4145, 945, 2663, 2351, 917], [2390, 4973, 1397, 288], [2889, 4478, 2125, 4985, 3256, 2137], [554, 2455, 2866, 4868, 182, 3511, 2280, 1287, 1901], [1152, 4985, 3740, 1263, 4727, 1599, 132, 4985, 2615], [3446, 2125, 4741, 4597, 3387, 3167, 2522, 2261, 4219, 2484], [4323, 3748, 1918, 3992, 4983, 1646, 4183, 109, 4621, 4280], [3875, 1140, 3260, 675, 854, 1942, 3531], [2125, 2717, 2644, 1590], [2896, 2749, 1152], [1538, 807, 312, 132, 4966, 3974, 2288, 3146, 3905], [132, 4985, 2712, 4285, 3260, 192, 3913, 1425, 3838, 4915, 1902, 383, 109], [2898, 554, 3642, 312, 2383, 3995], [2519, 2784, 304, 2486, 537, 323, 4295, 3260], [1102, 906, 2125, 3313, 3200, 1762], [4361, 918, 2837, 2521, 962, 2701, 2801, 896, 2394, 4670], [132, 4985, 2866, 711, 146, 351, 4933], [4868, 1168, 4324, 1365,

In [14]:
from keras.preprocessing.sequence import pad_sequences
embedding = pad_sequences(one_hot_rep, padding = 'pre', maxlen = longest_list)
print(embedding)

[[   0    0    0 ...  243  132 4371]
 [   0    0    0 ... 2125 1416 4280]
 [   0    0    0 ... 4717 2961 3912]
 ...
 [   0    0    0 ... 3080  926 4060]
 [   0    0    0 ...  159  807 4760]
 [   0    0    0 ... 2125 4221 2292]]


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense

embed_feature = 50
model = Sequential()
model.add(Embedding(voc_size, embed_feature, input_length = longest_list))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss= 'binary_crossentropy', optimizer = 'adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 194, 50)           250000    
                                                                 
 dropout (Dropout)           (None, 194, 50)           0         
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 310,501
Trainable params: 310,501
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)

In [17]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((5068, 194), (1267, 194), (5068,), (1267,))

In [18]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs = 10, batch_size= 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f64642eb640>

In [27]:
y_pred = model.predict(x_test)
y_pred
print(type(y_pred))
print(type(y_test))
y_pred

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


array([[9.9879789e-01],
       [8.5485911e-01],
       [9.9252427e-01],
       ...,
       [2.4411082e-04],
       [9.9791121e-01],
       [4.6746248e-01]], dtype=float32)

In [28]:
x_test

array([[   0,    0,    0, ..., 1911,  310, 1207],
       [   0,    0,    0, ..., 1248, 3182, 4140],
       [   0,    0,    0, ..., 3041, 2143, 2437],
       ...,
       [   0,    0,    0, ..., 1923, 1431, 1539],
       [   0,    0,    0, ...,  888,  884, 1365],
       [   0,    0,    0, ...,  118, 2083, 2174]], dtype=int32)

In [30]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#confusion_matrix(y_test, y_pred)
#accuracy_score(y_test, y_pred)
