In [1]:
import data
import spacy
import numpy as np
from sklearn import manifold
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

nlp  = spacy.load('en_core_web_md')

In [100]:
def keep_token(t):
    return (t.is_alpha and 
            not (t.is_space or t.is_punct or 
                 t.is_stop or t.like_num))

def lemmatize_doc(doc):
    return [ t.lemma_ for t in doc if keep_token(t)]
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=5,
                      threshold=7,
                      progress_per=1000)
    return Phraser(phrases)

In [101]:
phrases_model = build_phrases(docs)

In [102]:
phrases_model

<gensim.models.phrases.Phraser at 0x7fb94ab49d50>

In [56]:
auths = data.get_processed_data()
docs = [lemmatize_doc(nlp(" ".join(auth.clean))) for auth in list(auths.values())]

docs_dict = Dictionary(docs)
docs_dict.filter_extremes(no_below=20, no_above=0.2)
docs_dict.compactify()

docs_corpus = [docs_dict.doc2bow(doc) for doc in docs]
model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
docs_tfidf  = model_tfidf[docs_corpus]
docs_vecs   = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_tfidf])

tfidf_emb_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])

docs_emb = np.dot(docs_vecs, tfidf_emb_vecs) 
xtrain = docs_emb[:200]
xtest = docs_emb[200:]

y = np.array([author.truth for author in list(auths.values())])
ytrain = y[:200]
ytest = y[200:]

In [80]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

test_accs = [] ; train_accs = []
for i in range(2, 300):
    docs_pca = PCA(n_components=i).fit_transform(docs_emb)
    xtrain = docs_pca[:200]
    xtest = docs_pca[200:]

    tsne = manifold.TSNE()
    viz = tsne.fit_transform(docs_pca)

    init = np.array([xtrain[1], xtrain[0]])

    model = KMeans(n_clusters=2, algorithm="full", init=init, max_iter=9999, n_init=10)
    model.fit(xtrain)

    train_pred = model.predict(xtrain)
    train_acc = accuracy_score(ytrain, train_pred)
    train_accs += [train_acc]

    test_pred = model.predict(xtest)
    test_acc = accuracy_score(ytest, test_pred)
    test_accs += [test_acc]

    print(train_acc, test_acc)

    # fig, ax = plt.subplots()
    # ax.margins(0.05) 
    # zero_indices = np.where(y == 0)[0]
    # one_indices = np.where(y == 1)[0]
    # ax.plot(viz[zero_indices,0], viz[zero_indices,1], marker='o', linestyle='', 
    #         ms=8, alpha=0.3, label="Not Fake News")
    # ax.plot(viz[one_indices,0], viz[one_indices,1], marker='o', linestyle='', 
    #         ms=8, alpha=0.3, label="Fake News")
    # ax.legend()
    # plt.title(f"PCA {i}")
    # plt.show()

plt.figure(figsize=(20,20))
plt.plot(range(2, 300), train_accs, label="Training")
plt.plot(range(2, 300), test_accs, label="Testing")
plt.show()

0.34 0.39
0.34 0.39
0.345 0.4
0.34 0.41
0.34 0.4
0.34 0.4
0.335 0.42
0.335 0.4
0.335 0.4
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41
0.335 0.41


KeyboardInterrupt: 

In [83]:
docs_emb[0]

array([-3.86447102e-01, -1.72147676e-02,  1.28088027e-01,  7.33549297e-02,
        1.14122200e+00, -4.08551335e-01, -7.18038797e-01,  2.77640074e-01,
       -5.12234628e-01,  7.80458832e+00, -1.13108957e+00, -2.09667861e-01,
       -2.16553077e-01, -1.98100448e-01, -2.77872026e-01,  3.24287683e-01,
        1.21844001e-01,  4.38018703e+00,  6.97849214e-01, -7.82643795e-01,
        2.65942276e-01,  5.62826693e-01,  4.97382194e-01,  1.67652387e-02,
       -2.73004413e-01, -6.96716666e-01, -5.24541497e-01,  5.27035594e-01,
       -1.74810320e-01,  1.98621854e-01,  2.90647864e-01,  5.79948306e-01,
       -4.82353643e-02, -3.11240464e-01, -1.23873286e-01, -3.21687937e-01,
       -4.59722653e-02, -3.42020690e-01,  3.16998154e-01,  2.83069746e-03,
        3.44794542e-01, -4.23756428e-02,  1.00050342e+00, -9.98286068e-01,
       -2.85007153e-02, -9.48761165e-01,  5.01237094e-01, -3.10657382e-01,
        5.66955714e-04,  7.33617544e-02, -1.74489558e-01,  5.50555468e-01,
        9.42491516e-02,  