# Imports

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy
from tensorflow.keras.layers import Dense, Conv1D, Embedding, Flatten, Masking, Dropout, LSTM, GRU
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import string
from nltk.stem import WordNetLemmatizer as wn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder as le
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
csv_path = '../raw_data'
lyricsdf = pd.read_csv(os.path.join(csv_path, 'song_lyrics.csv'))
recipesdf = pd.read_csv(os.path.join(csv_path, 'recipes.csv'))
fooddf = pd.read_csv(os.path.join(csv_path, 'food_reviews.csv'))
bookdf = pd.read_csv(os.path.join(csv_path, 'book_descriptions.csv'))
cleanfake = pd.read_csv(os.path.join(csv_path, 'clean_data_Fake.csv'))
cleantrue = pd.read_csv(os.path.join(csv_path, 'clean_data_True.csv'))

In [6]:
lyricsdf, recipesdf, fooddf, bookdf, cleanfake, cleantrue

(       Artist                                  Title  \
 0      Khalid                     Young Dumb & Broke   
 1      Khalid                               Location   
 2      Khalid                                 Better   
 3      Khalid                                   Talk   
 4      Khalid                                  Saved   
 ...       ...                                    ...   
 4163  Beyoncé  Check On It (Beyoncé Experience Live)   
 4164  Beyoncé     Ring The Alarm (Tranzformas Remix)   
 4165  Beyoncé                           Radio (Live)   
 4166  Beyoncé     Ring The Alarm (Karmatronic Remix)   
 4167  Beyoncé        Denial (Hold Up, Beyoncé, Poem)   
 
                             Album    Year        Date  \
 0                   American Teen  2017.0  2017-03-03   
 1                   American Teen  2016.0  2016-04-30   
 2                         Suncity  2018.0  2018-09-14   
 3                     Free Spirit  2019.0  2019-02-07   
 4                   Ame

# Variables

In [7]:
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [8]:
binacc = BinaryAccuracy()
prec = Precision()
rec = Recall()

2022-08-29 15:21:04.538127: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-08-29 15:21:04.541346: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-08-29 15:21:04.543876: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP-0UJ0SON8): /proc/driver/nvidia/version does not exist
2022-08-29 15:21:04.562106: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
stop_words = stopwords.words('english')

# Functions

In [10]:
def lematize (words):
    for index, word in enumerate (words):
        words[index] = wn().lemmatize(word, pos='v')
    for index, word in enumerate (words):
        words[index] = wn().lemmatize(word, pos='r')
    for index, word in enumerate (words):
        words[index] = wn().lemmatize(word, pos='a')
    for index, word in enumerate (words):
        words[index] = wn().lemmatize(word, pos='n')
    for index, word in enumerate (words):
        words[index] = wn().lemmatize(word, pos='s')
    return ' '.join(words)

In [11]:
def preprocessing(sentence):
    sentence = sentence.strip()
    
    sentence = sentence.lower()
    
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, ' ') 
    
    sentence = sentence.strip()
    
    sentence = word_tokenize (sentence)
    final = []
    for word in sentence:
        if word in stop_words:
            sentence.remove(word)
        if len(word) >= 3:
            final.append(word)
            
    lematize(final)
            
    return ' '.join(final)

In [12]:
def plot_loss_accuracy(history, title):
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(15,10))
    
    ax1.plot(history.history['loss'])
    ax1.plot(history.history['val_loss'])
    ax1.set_title(f'{title} Loss')
    ax1.set_ylabel('Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylim(ymin=0, ymax=2)
    ax1.legend(['Train', 'Validation'], loc='best')
    ax1.grid(axis="x",linewidth=0.5)
    ax1.grid(axis="y",linewidth=0.5)    
    
    ax2.plot(history.history['binary_accuracy'])
    ax2.plot(history.history['val_binary_accuracy'])
    ax2.set_title(f'{title} Accuracy')
    ax2.set_ylabel('Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylim(ymin=0, ymax=2)
    ax2.legend(['Train', 'Validation'], loc='best')
    ax2.grid(axis="x",linewidth=0.5)
    ax2.grid(axis="y",linewidth=0.5)    

    ax3.plot(history.history['precision'])
    ax3.plot(history.history['val_precision'])
    ax3.set_title(f'{title} Precision')
    ax3.set_ylabel('Precision')
    ax3.set_xlabel('Epoch')
    ax3.set_ylim(ymin=0, ymax=2)
    ax3.legend(['Train', 'Validation'], loc='best')
    ax3.grid(axis="x",linewidth=0.5)
    ax3.grid(axis="y",linewidth=0.5)    
    
    ax4.plot(history.history['recall'])
    ax4.plot(history.history['val_recall'])
    ax4.set_title(f'{title} Recall')
    ax4.set_ylabel('Recall')
    ax4.set_xlabel('Epoch')
    ax4.set_ylim(ymin=0, ymax=2)
    ax4.legend(['Train', 'Validation'], loc='best')
    ax4.grid(axis="x",linewidth=0.5)
    ax4.grid(axis="y",linewidth=0.5)  
    
    plt.show()

In [13]:
def initialize_model (model_name, neurons):
    model = Sequential()
    model.add(Embedding(
        input_dim=(len(tokenizer.word_index))+1, # +1 for the 0 padding
        input_length=300, # Max_sentence_length (optional, for model summary)
        output_dim=100,
        mask_zero=True, # Built-in masking layer :)
    ))
    if model_name == 'lstm':
        model.add(LSTM((neurons*2), return_sequences=False))
        model.add(Dense((neurons), activation='relu'))
    if model_name == 'gru':
        model.add(GRU((neurons*2), return_sequences=False))
        model.add(Dense((neurons), activation='relu'))
    if model_name == 'cnn':
        model.add(Conv1D(neurons, kernel_size=5, activation='tanh'))
        model.add(Flatten())
        model.add(Dense((neurons/2), activation='relu'))
    #model.add(Dropout(rate=0.2))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [14]:
def compile_model(model):
    model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(learning_rate=0.0001), 
              metrics=[binacc, prec, rec]) 
    return model

# Preparing data

In [15]:
food_proc = fooddf['Text'].apply(preprocessing)
food_proc

0        once saw peanut butter and cereal please that ...
1        great natural treat for cat whose health stron...
2        great treat each into piece for week puggle th...
3        this snack food and think this the taste cerea...
4        this buy this regularly back during thoroughly...
                               ...                        
27995    nothing website amazon com let know these be n...
27996    all these helper make but have dull ache all t...
27997    eat this every day and rather other pack becau...
27998    great coffee flavor even when dash milk strong...
27999    this nummy and bar with goodness the granola b...
Name: Text, Length: 28000, dtype: object

In [16]:
book_proc = bookdf['description'].apply(preprocessing)
book_proc

0        jeffery masson begin be about year two chick t...
1        this biography from american bar association v...
2        introduction sa statistical software foundatio...
3        do know that cornish pasty be tin miner from p...
4        interaction mosaic silver edition fully integr...
                               ...                        
27995    when break free from habit that world open let...
27996    the book present the and assumption behavioral...
27997    best sell book save the go movie blake snyder ...
27998    book two annihilate you read the volume before...
27999    strategy and collide fey history carrier us or...
Name: description, Length: 28000, dtype: object

In [17]:
recipes_proc = recipesdf['full'].apply(preprocessing)
recipes_proc

0        pan sear steak vegetable noodle soup garnish w...
1        spinach and soup this cold winter month serve ...
2        spinach and miso soup this simple soup full an...
3        corn dip and unlike anything you have once the...
4        sunday chicken rice bake warm and casserole wi...
                               ...                        
27995    spinach salad with spinach good for and waistl...
27996    slow bake tomato with and from home and websit...
27997    pumpkin harvest muffin the add moisture nice f...
27998    low fat blueberry muffin with these the muffin...
27999    homemade vegetable dip this always make when p...
Name: full, Length: 28000, dtype: object

In [18]:
lyrics_proc = lyricsdf['Lyric'].apply(preprocessing)
lyrics_proc

0       think just you not you you moment this just du...
1       send location let cause need the and through t...
2       well nothing baby nothing feel well not drink ...
3       can just can just talk about goin before get l...
4       the part always seem forever sometimes that ar...
                              ...                        
4163    ooh boy you like you what see win you over che...
4164    refrain ring the be this long but damn see ano...
4165    you the one that allow out with door close and...
4166    refrain ring the be this long but damn see ano...
4167    close more soft pretty le awake fast for wear ...
Name: Lyric, Length: 4168, dtype: object

In [33]:
notnewsdf = pd.DataFrame(pd.concat([book_proc.sample(13575), food_proc.sample(13575), recipes_proc.sample(13575), lyrics_proc], ignore_index=True), columns=['text'])
notnewsdf = notnewsdf.sample(n=len(notnewsdf), ignore_index=True)

In [34]:
notnewsdf['target'] = 0
notnewsdf

Unnamed: 0,text,target
0,roast butternut squash and pasta from light pe...,0
1,the lie more minute hard noodle other that noo...,0
2,when rain face and whole world your offer you ...,0
3,the novel twelve house from national bestselli...,0
4,grape jelly barbecue sauce and sauce you chick...,0
...,...,...
44888,easy crock pot turkey leg turkey leg taste gre...,0
44889,inside hot get something that can buy fight bi...,0
44890,chicken and dumpling not this yet but for reci...,0
44891,planner paper kate publish purple sloth,0


In [35]:
newsdf = pd.concat([cleanfake, cleantrue], ignore_index=True)
newsdf = newsdf.sample(n=len(newsdf), ignore_index=True)

In [36]:
newsdf['target'] = 1
newsdf

Unnamed: 0,text,target
0,see politically incorrect sign dairy queen own...,1
1,sander could endorse clinton white house bid s...,1
2,house tax chairman propose tweak tax cut bill ...,1
3,two fantastic tweet perfectly show main differ...,1
4,trump campaign california denounce protester r...,1
...,...,...
44893,trump right cnn very fake news federal judge r...,1
44894,germany merkel tell erdogan speed eu aid turke...,1
44895,senate reject immigration bill…trump call tota...,1
44896,must watch mark steyn call political violence ...,1


In [37]:
fulldf = pd.concat([newsdf, notnewsdf], ignore_index=True)
fulldf = fulldf.sample(n=len(fulldf), ignore_index=True)

In [38]:
fulldf

Unnamed: 0,text,target
0,american torch hypocrite gop lawmaker use affo...,1
1,british government extend northern ireland tal...,1
2,this title this may have imperfection such mis...,0
3,onion raspberry jalapeno chutney with zing enj...,0
4,peet major dickinson much all drink whenever g...,0
...,...,...
89786,word search puzzle jumbo edition fun and word ...,0
89787,kellyanne conway trump gracious prosecute clin...,1
89788,slow cook mexican shred beef this come from co...,0
89789,tweet trump definitely regret send probably aw...,1


# Testing model

In [39]:
X_train, X_test, y_train, y_test = train_test_split(fulldf['text'], fulldf['target'], test_size=0.25)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)
X_train_token = pad_sequences(X_train_token, dtype='int32', padding='post', maxlen=300)
X_test_token = pad_sequences(X_test_token, dtype='int32', padding='post', maxlen=300)

In [None]:
%%time
model = compile_model(initialize_model('cnn', 12))
model.summary()
history = model.fit(X_train_token, y_train, 
      epochs=30, 
      batch_size=32,
      validation_split=0.25,
      callbacks=[es],
    verbose = 1, 
    use_multiprocessing=True
      )
model.evaluate(X_test_token, y_test, verbose=1)
plot_loss_accuracy(history, 'cnn')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          12172100  
                                                                 
 conv1d (Conv1D)             (None, 296, 12)           6012      
                                                                 
 flatten (Flatten)           (None, 3552)              0         
                                                                 
 dense (Dense)               (None, 6)                 21318     
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 12,199,437
Trainable params: 12,199,437
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


# Conclusion: we're good to go!

# Saving model

In [None]:
model.save('../fake-news/models/notnewsfromnews.tf')