In [1]:
import string
import re
import spacy 
import nltk
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import tensorflow as tf 
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on: the stop-word lists and WordNet.
for _resource in ('stopwords', 'wordnet'):
    nltk.download(_resource)

# Temporarily hide DeprecationWarning messages; every other warning category is still shown.
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)


In [1]:
# Load the combined news dataset and discard every row that has a missing value.
data = pd.read_csv('../data/combined_news.csv').dropna()
data.info()

NameError: name 'pd' is not defined

In [None]:
# Class balance: number of articles per label. The column is passed as `x=` because
# newer seaborn releases accept only `data` as a positional argument.
sns.countplot(x=data['label'])

In [None]:
# Article length in words. split() with no argument splits on any run of whitespace, so
# repeated spaces, tabs and newlines do not distort the count the way split(' ') does.
data['length'] = data['text'].apply(lambda x: len(x.split()))
# histplot replaces the deprecated distplot; kde=True with stat='density' draws the same
# density-scaled histogram with a KDE curve on top.
sns.histplot(x=data['length'], kde=True, stat='density')
data['length'].describe()

In [None]:
# Load the spaCy model named 'en_core_web_sm'.
# NOTE(review): `nlp` is not referenced by any other cell visible here — confirm it is
# used further down, otherwise this load can be dropped.
nlp = spacy.load('en_core_web_sm')

In [None]:
# main tokenizing function
def tokenize_line(line, chars_to_exclude, stopwords, tokenizer, stem_tokens=False, stemmer=None, lemm_tokens=False, lemmatizer=None):
    """Clean a single line of text and split it into tokens.

    Args:
        line: The text to tokenize.
        chars_to_exclude: String (or iterable of single characters) to strip from
            ``line``. Digits are always stripped as well.
        stopwords: Container of tokens to drop; membership is tested after the
            line has been lower-cased.
        tokenizer: Object exposing ``tokenize(str)`` that returns an iterable of tokens.
        stem_tokens: When true, every token is passed through ``stemmer.stem``.
        stemmer: Object exposing ``stem(token)``; required when ``stem_tokens`` is true.
        lemm_tokens: When true, every token is passed through ``lemmatizer.lemmatize``.
        lemmatizer: Object exposing ``lemmatize(token)``; required when
            ``lemm_tokens`` is true.

    Returns:
        list: The remaining tokens, stemmed first and lemmatized second when both
        options are enabled.
    """
    # removing unwanted characters and numbers from the string.
    # re.escape keeps regex metacharacters such as `]`, `\`, `^` and `-` literal inside
    # the character class; without it a `]` in chars_to_exclude closes the class early
    # and some of the requested characters are silently left in the text.
    pattern = '[' + re.escape(''.join(chars_to_exclude)) + r'\d]'
    line = re.sub(pattern, "", line).lower()

    # generating tokens
    tokens = [token for token in tokenizer.tokenize(line) if token not in stopwords]

    # stemming the tokens if the user wants to
    if stem_tokens:
        tokens = [stemmer.stem(token) for token in tokens]

    # lemmatizing the tokens if the user wants to
    if lemm_tokens:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [None]:
# function to help with tokenizing columns
def tokenize_lines(lines, chars_to_exclude, stopwords, tokenizer=None, stem_tokens=False, stemmer=None, lemm_tokens=False, lemmatizer=None):
    """Tokenize every line of an iterable with ``tokenize_line``.

    Args:
        lines: Iterable of strings (for example a DataFrame column).
        chars_to_exclude, stopwords, tokenizer, stem_tokens, stemmer, lemm_tokens,
        lemmatizer: Forwarded unchanged to ``tokenize_line`` for every line.

    Returns:
        numpy.ndarray: 1-D array of dtype ``object`` with one token list per input line.
    """
    all_tokens = [tokenize_line(line, chars_to_exclude, stopwords, tokenizer, stem_tokens, stemmer, lemm_tokens, lemmatizer) for line in lines]

    # Token lists normally differ in length. np.array(all_tokens) raises on such ragged
    # input in recent NumPy releases, and silently builds a 2-D array when the lengths
    # happen to match. Filling a pre-allocated object array always yields one list per row.
    token_array = np.empty(len(all_tokens), dtype=object)
    for index, tokens in enumerate(all_tokens):
        token_array[index] = tokens

    return token_array

In [None]:
# the default parameters for now
def get_default_tokenization_params():
    """Build the default tokenization arguments.

    Returns:
        tuple: ``(chars_to_exclude, english_stopwords, tokenizer, stem_tokens, stemmer,
        lemm_tokens, lemmatizer)``. The order matches the parameters that follow
        ``lines`` in ``tokenize_lines``, so the tuple can be unpacked with ``*``.
    """
    chars_to_exclude = string.punctuation
    english_stopwords = set(stopwords.words('english'))
    # Raw string: "\w" in a normal literal is an invalid escape sequence and triggers
    # a warning on current Python versions.
    tokenizer = RegexpTokenizer(r"\w+")

    #Modify these according to the user's choices
    stem_tokens = False
    stemmer = PorterStemmer()

    lemm_tokens = True
    lemmatizer = WordNetLemmatizer()

    return (chars_to_exclude, english_stopwords, tokenizer, stem_tokens, stemmer, lemm_tokens, lemmatizer)

In [None]:

#To-Do: Brush up the following code to highlight the top topics discussed in the data sets: 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
import pyLDAvis.sklearn

# Tokenize every article body with the default parameters; X holds one token list per article.
X = tokenize_lines(data['text'], *get_default_tokenization_params())
# Labels, taken unchanged from the raw frame.
Y = data['label']
# Processed frame: original title and label, with the raw text replaced by its token list.
data_processed = pd.DataFrame(({'title': data['title'], 'text': X, 'label': Y}))


# Subsequent section attempts to perform some Topic Modeling using LDA

# This section focuses on real news
real_news = data_processed[data_processed['label'] == 'Real']
# Number of LDA topics; also passed to the gensim LDA models trained in later cells.
num_topics = 10 
# Vocabulary cap; in the cells visible here it is only referenced by the disabled draft below.
num_features = 100
# Disabled scikit-learn / pyLDAvis draft, kept as a bare string literal so it does not run.
# NOTE(review): as the last expression of the cell the string is presumably echoed as the
# cell output — confirm, and consider moving the draft out of the notebook.
'''
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
lda_pipeline = Pipeline([('vectorizer', vectorizer), ('lda', lda)])

pyLDAvis.enable_notebook()
data_vectorized = vectorizer.fit_transform(data_processed['text'])
dash = pyLDAvis.sklearn.prepare(lda_pipeline.steps[1][1], data_vectorized, vectorizer, mds='tsne')
pyLDAvis.save_html(dash, 'real_news_lda.html')

# Repeat the above for fake news

'''


In [None]:
# Token lists of the real-news articles; this is the input of the gensim cells below.
realtext = real_news['text']

In [None]:
# Gensim library for additional topic modeling and statistical analysis.
# The corpora module implements Dictionary, a mapping between words and their integer IDs.
from gensim import corpora

real_gensim_dictionary = corpora.Dictionary(realtext)
# Create the bag-of-words (bow) corpus from realtext. The dictionary above was already
# built from these same documents, so doc2bow is called without allow_update=True:
# updating again would count every document a second time in the dictionary's statistics.
real_gensim_corpus = [real_gensim_dictionary.doc2bow(tokens) for tokens in realtext]

In [None]:
import pickle
# Save the dictionary and the bow corpus (via pickle) so the cells below can reload them.
filename1 = 'real_gensim_corpus.pkl'
# The context manager flushes and closes the file as soon as the dump has finished.
with open(filename1, 'wb') as corpus_file:
    pickle.dump(real_gensim_corpus, corpus_file)
real_gensim_dictionary.save('real_gensim_dictionary.gensim')

In [None]:
import gensim
# Create the LDA model for the real-news corpus.
# Takes very long to run.
# random_state is fixed so the topics, and the scores computed from them in the cells
# below, are reproducible from one run to the next.
real_lda_model = gensim.models.ldamodel.LdaModel(
    real_gensim_corpus,
    num_topics=num_topics,
    id2word=real_gensim_dictionary,
    passes=10,
    random_state=0,
)
real_lda_model.save('real_gensim_model.gensim')

In [None]:
# Print every topic of the real-news LDA model, described by 5 words per topic.
topics = real_lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

In [None]:
# Evaluating LDA: topic coherence and perplexity.
# Values recorded from an earlier run:
#   Perplexity: -8.169369315206076 (low)
#   Coherence: 0.4601994449293338 (high)
# Perplexity is low and coherence is high, which is expected as this is real news(?)
# NOTE(review): the "Perplexity" figure is whatever log_perplexity() returns; the negative
# value suggests it is a log-scale bound rather than a perplexity — confirm against the
# gensim documentation before interpreting it as "low".

from gensim.models import CoherenceModel

# c_v coherence of the real-news model, computed from the tokenized texts and the dictionary.
coherence_score_lda = CoherenceModel(model=real_lda_model, texts=realtext, dictionary=real_gensim_dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()

print('\nPerplexity:', real_lda_model.log_perplexity(real_gensim_corpus))
print('\nCoherence Score:', coherence_score)

In [None]:
# Topic Modeling Visualization

# Reload the dictionary, corpus and model that the earlier cells saved to disk.
real_gensim_dictionary = gensim.corpora.Dictionary.load('real_gensim_dictionary.gensim')
# The context manager closes the file once the corpus is loaded.
# NOTE: unpickling can execute arbitrary code — only load files written by this notebook.
with open(filename1, 'rb') as corpus_file:
    real_gensim_corpus = pickle.load(corpus_file)
real_lda_model = gensim.models.ldamodel.LdaModel.load('real_gensim_model.gensim')

import pyLDAvis.gensim_models

# sort_topics=False is passed through to pyLDAvis unchanged.
lda_visualization = pyLDAvis.gensim_models.prepare(real_lda_model, real_gensim_corpus, real_gensim_dictionary, sort_topics=False)
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_visualization)

In [None]:
# Same pipeline as for the real news, now restricted to the articles labelled 'Fake'.
fake_news = data_processed.loc[data_processed['label'] == 'Fake']
faketext = fake_news.loc[:, 'text']

In [None]:
fake_gensim_dictionary = corpora.Dictionary(faketext)
# Create the bag-of-words (bow) corpus from faketext. The dictionary above was already
# built from these same documents, so doc2bow is called without allow_update=True:
# updating again would count every document a second time in the dictionary's statistics.
fake_gensim_corpus = [fake_gensim_dictionary.doc2bow(tokens) for tokens in faketext]

# Pickling is useful for efficiently storing data to be used later; the context manager
# flushes and closes the file as soon as the dump has finished.
filename2 = 'fake_gensim_corpus.pkl'
with open(filename2, 'wb') as corpus_file:
    pickle.dump(fake_gensim_corpus, corpus_file)
fake_gensim_dictionary.save('fake_gensim_dictionary.gensim')

In [None]:
# Train the LDA model for the fake-news corpus.
# random_state is fixed so the topics, and the scores computed from them in the cells
# below, are reproducible from one run to the next.
fake_lda_model = gensim.models.ldamodel.LdaModel(
    fake_gensim_corpus,
    num_topics=num_topics,
    id2word=fake_gensim_dictionary,
    passes=10,
    random_state=0,
)
fake_lda_model.save('fake_gensim_model.gensim')

In [None]:
# Disabled fake-news counterpart of the real-news topic printout. It is kept as a bare
# string literal, so none of it runs.
# NOTE(review): presumably switched off on purpose — confirm, then re-enable or delete it.
"""topics = fake_lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)"""

In [None]:
# c_v coherence and log_perplexity() report for the fake-news LDA model.
coherence_score_lda = CoherenceModel(
    model=fake_lda_model,
    texts=faketext,
    dictionary=fake_gensim_dictionary,
    coherence='c_v',
)
coherence_score = coherence_score_lda.get_coherence()

print('\nPerplexity:', fake_lda_model.log_perplexity(fake_gensim_corpus))
print('\nCoherence Score:', coherence_score)

In [None]:
# Disabled fake-news counterpart of the real-news pyLDAvis cell. It is kept as a bare
# string literal, so none of it runs.
# NOTE(review): presumably switched off on purpose — confirm, then re-enable or delete it.
"""fake_gensim_dictionary = gensim.corpora.Dictionary.load('fake_gensim_dictionary.gensim')
fake_gensim_corpus = pickle.load(open(filename2, 'rb'))
fake_lda_model = gensim.models.ldamodel.LdaModel.load('fake_gensim_model.gensim')

import pyLDAvis.gensim_models

lda_visualization = pyLDAvis.gensim_models.prepare(fake_lda_model, fake_gensim_corpus, fake_gensim_dictionary, sort_topics=False)
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_visualization)"""

In [None]:
'''
Defining and Training the Model: 
There will be several different types of sequential operations and layers: 

1. A tokenizer to transform each article into a vector of tokens
2. A word embedding layer that learns an embedding vector. 
3. A 1D convolutional and max-pooling layer -- this is to calculate the largest value in each feature map
4. LSTM (Long Short-Term Memory) units: these will form the recurrent part of the recurrent convolutional neural network.
'''

# First step is to import all of the necessary libraries for this experiment

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout
# Checking tensorflow version
if float(tf.__version__[0]) < 2.0:
    print("Updating Tensorflow")
    !pip install --upgrade tensorflow
else: 
    print("Correct Version of Tensorflow installed")