In [None]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from gensim.models import word2vec

_ENGLISH_STOPWORDS = None  # Lazily-filled cache used by _english_stopwords().


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, reading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_article(article, remove_stopwords = True):
    """Convert a piece of raw article text into a list of lower-case word tokens.

    Steps:
      1. Strip HTML markup.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Optionally drop English stopwords.

    Note: stopwords ARE removed by default. Pass ``remove_stopwords=False`` to
    keep them (e.g. when the model should see the full sentence context).

    Args:
        article: Raw text (str); may contain HTML.
        remove_stopwords: If True, drop NLTK English stopwords. (bool)

    Returns:
        cleaned_article: List of word tokens (list of str).
    """
    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser happens to be installed.
    article_text = BeautifulSoup(article, "html.parser").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", article_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        # The stopword set is cached because this function runs once per sentence.
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    cleaned_article = words

    return cleaned_article

def article_to_sentences( article: str, tokenizer: nltk.tokenize.punkt.PunktSentenceTokenizer ):
    """Split an article into sentences and turn each sentence into a word list.

    Args:
        article: Article text (str).
        tokenizer: Sentence tokenizer exposing ``tokenize(text)``.

    Returns:
        List of tokenized sentences, one inner list per sentence, e.g.
        [["word2vec", "was", "introduced", "by", "google"],
         ["it", "leverages", "distributed", "token", "representations"]]

    Each sentence produced by the tokenizer is passed through
    ``clean_article`` with its default arguments; sentences that end up with
    no tokens are kept as empty lists.
    """
    return [clean_article(raw_sentence) for raw_sentence in tokenizer.tokenize(article)]

# NLTK resources: 'punkt' for sentence splitting, 'stopwords' for clean_article.
nltk.download('punkt')
nltk.download('stopwords')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')
# Drop rows without article text. Calling dropna(inplace=True) on a selected
# column never removes rows from the DataFrame itself, so missing texts (NaN
# floats) could still reach the tokenizer; filter the frames instead.
train = train.dropna(subset=['text'])
test = test.dropna(subset=['text'])

# Flatten the training corpus into one list of tokenized sentences and record
# per-article / per-sentence sizes for the statistics computed later.
sentences = []
num_sentences_per_article = []
num_tokens_per_sentence = []
for article in train['text']:
    cleaned_article_sentences = article_to_sentences(article, tokenizer)
    num_sentences_per_article.append(len(cleaned_article_sentences))
    num_tokens_per_sentence.extend(len(sentence) for sentence in cleaned_article_sentences)
    sentences.extend(cleaned_article_sentences)


# 100-dimensional vectors, words seen fewer than 40 times are discarded,
# context window of 5 tokens, down-sampling threshold 1e-3.
trained_word2vec_model = word2vec.Word2Vec(sentences, workers=4,
            size=100, min_count=40,
            window=5, sample=1e-3)

In [None]:
# Dataset sizes, split by label value (0 and 1).
label_column = train['label']
num_articles_train = len(train['text'])
num_reliable_train = len(train[label_column == 0])
num_fake_train = len(train[label_column == 1])
num_articles_test = len(test['text'])

# Vocabulary size and the ten first entries of the model's word index,
# together with their corpus counts.
w2v_vectors = trained_word2vec_model.wv
num_words_vocab = len(w2v_vectors.vocab)
top_10_words = w2v_vectors.index2entity[:10]
top_10_word_frequencies = [w2v_vectors.vocab[word].count for word in top_10_words]

# Mean number of sentences per article and tokens per sentence.
average_sentences_per_article = sum(num_sentences_per_article) / len(num_sentences_per_article)
average_tokens_per_sentence = sum(num_tokens_per_sentence) / len(num_tokens_per_sentence)

In [16]:
top_10_word_frequencies

[80054, 66294, 56289, 38670, 37358, 36541, 30061, 27439, 26345, 25448]

In [None]:
def calculate_sparsity(train, w2v_model):
    """Return the sparsity of the article-by-vocabulary occurrence matrix.

    The matrix has one row per article with text and one column per word in
    the model vocabulary; a cell is non-zero when the word occurs in the
    article. Sparsity is ``1 - nonzero_cells / (n_articles * vocab_size)``.

    Args:
        train: DataFrame with a 'text' column; rows with missing text are skipped.
        w2v_model: Trained model exposing ``wv.vocab`` (word -> entry mapping).

    Returns:
        Fraction of zero cells (float).

    Relies on the notebook-level ``tokenizer`` and ``article_to_sentences``.
    """
    vocab = w2v_model.wv.vocab
    articles = train['text'].dropna()
    nonzero_cells = 0
    for article in articles:
        cleaned_article_sentences = article_to_sentences(article, tokenizer)
        article_words = {word for sentence in cleaned_article_sentences for word in sentence}
        # Only words that are columns of the matrix may be counted; the
        # vocabulary excludes rare words, so counting every distinct word
        # would overstate the number of non-zero cells.
        nonzero_cells += sum(1 for word in article_words if word in vocab)
    sparsity = 1 - (nonzero_cells / (len(articles) * len(vocab)))
    return sparsity

# Sparsity value for the training articles with respect to the vocabulary of
# the model trained on the full training corpus.
sparsity = calculate_sparsity(train,trained_word2vec_model)


        

In [None]:
#Generate cluster visualization plot

from sklearn.cluster import KMeans
def clustering_on_wordvecs(word_vectors, num_clusters=10, random_state=None):
    """Cluster word vectors with k-means.

    Args:
        word_vectors: Array-like of vectors, one row per word.
        num_clusters: Number of clusters to fit.
        random_state: Optional seed forwarded to KMeans. The default (None)
            keeps the previous unseeded behaviour; pass an int to make the
            clustering reproducible between runs.

    Returns:
        (cluster_centers, idx): the fitted centroids and the cluster index
        assigned to each input row.
    """
    # Initialize a k-means object and use it to extract centroids
    kmeans_clustering = KMeans(n_clusters=num_clusters, init='k-means++',
                               random_state=random_state)
    idx = kmeans_clustering.fit_predict(word_vectors)

    return kmeans_clustering.cluster_centers_, idx

# Cluster the embedding matrix into 11 groups and remember which cluster each
# vocabulary word was assigned to.
Z = trained_word2vec_model.wv.syn0
centers, clusters = clustering_on_wordvecs(Z, 11)
centroid_map = dict(zip(trained_word2vec_model.wv.index2word, clusters))

# For every centroid, look up the single vocabulary word closest to it.
# Lookups go through `.wv`, like the rest of this notebook; the model-level
# shortcuts (model.most_similar / model[word]) are deprecated in gensim 3.x.
center_words = []
for vec in centers:
    center_words.append(trained_word2vec_model.wv.most_similar(positive=[vec], topn=1))

keys = [word[0][0] for word in center_words]
# 'jxhnbinder' is excluded from the plot (presumably a noise token - TODO
# confirm). The clustering is not seeded, so the token is not guaranteed to be
# a centroid word; an unconditional list.remove() would raise ValueError.
if 'jxhnbinder' in keys:
    keys.remove('jxhnbinder')

# Collect, for each centroid word, its five nearest neighbours and their vectors.
embedding_clusters = []
word_clusters = []
for word in keys:
    embeddings = []
    words = []
    for similar_word, _similarity in trained_word2vec_model.wv.most_similar(word, topn=5):
        words.append(similar_word)
        embeddings.append(trained_word2vec_model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)


from sklearn.manifold import TSNE
import numpy as np

# Stack the per-cluster neighbour vectors into one (n, m, k) array, where the
# three axes are unpacked from its shape below.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

# t-SNE works on a flat list of points, so collapse the first two axes,
# project to 2-D, then restore the (n, m) grouping.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
flat_embeddings = embedding_clusters.reshape(n * m, k)
projected_points = tsne_model_en_2d.fit_transform(flat_embeddings)
embeddings_en_2d = np.array(projected_points).reshape(n, m, 2)

import matplotlib.pyplot as plt
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot groups of 2-D word embeddings, one colour per group.

    Args:
        title: Figure title.
        labels: One legend label per group.
        embedding_clusters: Per-group arrays indexed as ``[:, 0]`` (x) and
            ``[:, 1]`` (y), one row per word.
        word_clusters: Per-group lists of words, aligned with the rows of the
            matching embeddings array; each word is drawn next to its point.
        a: Alpha (opacity) of the scatter markers.
        filename: If given, the figure is also saved there as a PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` states "one colour for every point". Passing a single RGBA
        # row through `c=` is ambiguous: with exactly four points it is read
        # as four scalar values to colour-map.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


# Take the counts shown in the title from the data actually plotted instead of
# hard-coding them: embeddings_en_2d is shaped (clusters, neighbours, 2).
n_plotted_clusters, n_closest_words = embeddings_en_2d.shape[:2]
tsne_plot_similar_words(
    'Similarity clusters (n_closest={}) for cluster centers (n_clusters={}) '
    'from Kaggle Fake News Dataset'.format(n_closest_words, n_plotted_clusters),
    keys, embeddings_en_2d, word_clusters, 0.7,
    'similar_words.png')



In [None]:
# Split the training set by label value.
fake_train = train[(train['label']==1)]
real_train = train[(train['label']==0)]


def _sentences_from_articles(articles):
    """Flatten an iterable of article texts into one list of tokenized sentences."""
    collected = []
    for article in articles:
        collected.extend(article_to_sentences(article, tokenizer))
    return collected


def _train_word2vec(tokenized_sentences):
    """Train a Word2Vec model with the same settings as the full-corpus model."""
    return word2vec.Word2Vec(tokenized_sentences, workers=4,
                             size=100, min_count=40,
                             window=5, sample=1e-3)


# Series.dropna() returns the non-missing texts. The in-place variant on a
# column of a sliced frame does not filter the frame and triggers pandas'
# chained-assignment warning, so missing texts could reach the tokenizer.
fake_sentences = _sentences_from_articles(fake_train['text'].dropna())
real_sentences = _sentences_from_articles(real_train['text'].dropna())

fake_trained_word2vec_model = _train_word2vec(fake_sentences)
real_trained_word2vec_model = _train_word2vec(real_sentences)

# Ten first entries of each model's word index and their corpus counts.
top_10_fake_words = fake_trained_word2vec_model.wv.index2entity[:10]
top_10_real_words = real_trained_word2vec_model.wv.index2entity[:10]
top_10_fake_word_frequencies = [fake_trained_word2vec_model.wv.vocab[word].count for word in top_10_fake_words]
top_10_real_word_frequencies = [real_trained_word2vec_model.wv.vocab[word].count for word in top_10_real_words]


In [14]:
top_10_fake_word_frequencies

[20170, 19293, 16599, 16128, 14941, 14555, 12768, 11875, 10400, 10323]

In [None]:
def _fraction_uppercase_words(title):
    """Return the fraction (0.0-1.0) of whitespace-separated words that are upper-case.

    A word counts when ``str.isupper()`` is true for it. A title without any
    words yields 0.0 instead of dividing by zero.
    """
    words = title.split()
    if not words:
        return 0.0
    return sum(1 for word in words if word.isupper()) / len(words)


# Iterate over Series.dropna(): the in-place variant on a column of a sliced
# frame does not filter the frame, so a missing title (NaN float) would reach
# .split() and raise AttributeError.
# Despite the "percent" in the names, the stored values are fractions in [0, 1].
percent_words_capital_fake = [_fraction_uppercase_words(title) for title in fake_train['title'].dropna()]
percent_words_capital_real = [_fraction_uppercase_words(title) for title in real_train['title'].dropna()]
        

In [None]:
# Summary statistics of the per-title upper-case word ratios, fake titles first.
average_percent_capital_fake = sum(percent_words_capital_fake) / len(percent_words_capital_fake)
min_percent_capital_fake, max_percent_capital_fake = (
    min(percent_words_capital_fake),
    max(percent_words_capital_fake),
)

# Same statistics for the real titles.
average_percent_capital_real = sum(percent_words_capital_real) / len(percent_words_capital_real)
min_percent_capital_real, max_percent_capital_real = (
    min(percent_words_capital_real),
    max(percent_words_capital_real),
)

def get_percent_capital_boxplots():
    """Draw side-by-side boxplots of the upper-case word ratio of fake vs. real titles.

    Reads the notebook-level lists ``percent_words_capital_fake`` and
    ``percent_words_capital_real`` and writes the figure to 'fig1.png' and
    'percent_capital.png'.
    """
    # Create a fresh figure and axes. Reusing figure number 1 would share
    # state with any other plot that uses the same number, and setting the
    # title before the figure exists attaches it to a different axes.
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.set_title("Percent Capital Words for Titles")

    # Create the boxplot
    ax.boxplot([percent_words_capital_fake, percent_words_capital_real])
    # Label the boxes AFTER drawing them: boxplot() resets the x tick labels,
    # so labels set beforehand are lost.
    ax.set_xticklabels(['Fake', 'Real'])

    # Save the figure
    fig.savefig('fig1.png', bbox_inches='tight')
    fig.savefig("percent_capital.png", format='png', dpi=150, bbox_inches='tight')

get_percent_capital_boxplots()

In [None]:
# Number of whitespace-separated words in every title, per class.
# Iterate over Series.dropna(): the in-place variant on a column of a sliced
# frame does not filter the frame, so a missing title (NaN float) would reach
# .split() and raise AttributeError.
title_length_fake = [len(title.split()) for title in fake_train['title'].dropna()]
title_length_real = [len(title.split()) for title in real_train['title'].dropna()]

def get_title_length_boxplots():
    """Draw side-by-side boxplots of the title length (in words) of fake vs. real titles.

    Reads the notebook-level lists ``title_length_fake`` and
    ``title_length_real`` and writes the figure to 'title_length.png'.
    """
    # Create a fresh figure and axes. Reusing figure number 1 would share
    # state with any other plot that uses the same number, and setting the
    # title before the figure exists attaches it to a different axes.
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.set_title("Number of Words for Titles")

    # Create the boxplot
    ax.boxplot([title_length_fake, title_length_real])
    # Label the boxes AFTER drawing them: boxplot() resets the x tick labels,
    # so labels set beforehand are lost.
    ax.set_xticklabels(['Fake', 'Real'])

    # Save the figure
    fig.savefig("title_length.png", format='png', dpi=150, bbox_inches='tight')

get_title_length_boxplots()