In [2]:
from nbcode import Data, TrainingSet, TestingSet, ValidationSet
from gensim import corpora, models
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import pyLDAvis
import pyLDAvis.gensim

import sys
import warnings
# Silence every warning for the rest of the session, unless the interpreter
# was started with explicit -W options (sys.warnoptions is then non-empty and
# the user's own warning filters are left in charge).
# NOTE(review): a blanket "ignore" also hides deprecation and convergence
# warnings that the libraries used in this notebook may emit -- consider
# narrowing the filter to specific categories.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
    

# Build the project's Data container with its default arguments.
data = Data()

# Show the raw 'kwds' of the first entry in data.initlist. The comprehension
# reads 'kwds' from every entry before [0] selects the first one.
orig_kwds_sample = [i['kwds'] for i in data.initlist][0]
print("number of kwds: %s\n" % len(orig_kwds_sample), 
      "Keywords: \n", orig_kwds_sample)
# Run the same sample through Data.keyword_cleaner and print the result so the
# raw and cleaned forms can be compared.
s = data.keyword_cleaner(kwds=orig_kwds_sample)
print(s)



# Three splits drawn from the same Data object with max_size 0.6 / 0.2 / 0.2.
# presumably max_size is a fraction of the available entries -- TODO confirm
# against nbcode.
training_set = TrainingSet(data=data, max_size=0.6)

# exclude_ids receives the ids already taken by the training split.
validation_set = ValidationSet(data=data, max_size=0.2, 
                               exclude_ids = training_set.ids)

# exclude_ids receives the union of the training and validation ids.
testing_set = TestingSet(data=data, max_size=0.2, 
                         exclude_ids=training_set.ids.union(validation_set.ids))

# Separate Data objects built from the blog-only and news-only entry lists.
blog_set = Data(lst=data.blog_entries)
news_set = Data(lst=data.news_entries)

print("Number of Blog Entries: ", len(blog_set.ids))
print("Number of News Entries: ", len(news_set.ids))


number of kwds: 162
 Keywords: 
 ['science', 'fiction', 'is', 'littered', 'with', 'examples', 'of', 'intelligent', 'computers', 'from', 'hal', '9000', 'in', '2001', 'space', 'odyssey', 'to', 'eddie', 'in', 'the', 'hitchhikers', 'guide', 'to', 'the', 'galaxy', 'one', 'thing', 'such', 'fictional', 'machines', 'have', 'in', 'common', 'is', 'tendency', 'to', 'go', 'wrong', 'to', 'the', 'detriment', 'of', 'the', 'characters', 'in', 'the', 'story', 'hal', 'murders', 'most', 'of', 'the', 'crew', 'of', 'mission', 'to', 'jupiter', 'eddie', 'obsesses', 'about', 'trivia', 'and', 'thus', 'puts', 'the', 'spacecraft', 'he', 'is', 'in', 'charge', 'of', 'in', 'danger', 'of', 'destruction', 'in', 'both', 'cases', 'an', 'attempt', 'to', 'build', 'something', 'useful', 'and', 'helpful', 'has', 'created', 'monster', ' ', 'successful', 'science', 'fiction', 'necessarily', 'plays', 'on', 'real', 'hopes', 'and', 'fears', 'in', 'the', '1960s', 'and', '1970s', 'when', 'hal', 'and', 'eddie', 'were', 'dreamed', 

In [2]:
# Switch plotly to offline notebook rendering before any figure is drawn.
init_notebook_mode(connected=True)

# Histogram of every keyword in the data set, normalised to probabilities.
pltlayout = go.Layout(title='Keyword Probabilities')
pltdata = [go.Histogram(x=list(data.all_keywords), histnorm='probability')]
fig = go.Figure(data=pltdata, layout=pltlayout)
iplot(fig)

In [3]:
# Number of LDA topics. The feature-building code further down assumes ten
# topics per document, so change both together.
NUM_TOPICS = 10
# Fixed seed so the topic model -- and every feature derived from it -- is the
# same on each Restart & Run All.
LDA_SEED = 42

blog_corpus = blog_set.keywords_as_docs
news_corpus = news_set.keywords_as_docs

# The dictionary and the topic model are fitted on the union of all three
# splits.
# NOTE(review): this lets the testing/validation documents shape the topic
# features the classifier is later scored on; fit on the training split only
# if a strict hold-out evaluation is required.
complete_corpus = []
complete_corpus.extend(training_set.keywords_as_docs)
complete_corpus.extend(testing_set.keywords_as_docs)
complete_corpus.extend(validation_set.keywords_as_docs)

d = corpora.Dictionary(complete_corpus)
d.compactify()

# Bag-of-words encodings of each document collection, all against the shared
# dictionary `d`.
corpus_bows = [d.doc2bow(a) for a in complete_corpus]
blog_bows = [d.doc2bow(b) for b in blog_corpus]
news_bows = [d.doc2bow(n) for n in news_corpus]
training_bows = [d.doc2bow(t) for t in training_set.keywords_as_docs]
validation_bows = [d.doc2bow(v) for v in validation_set.keywords_as_docs]
testing_bows = [d.doc2bow(tt) for tt in testing_set.keywords_as_docs]


lda = models.ldamodel.LdaModel(corpus=corpus_bows,
                               num_topics=NUM_TOPICS, alpha='auto',
                               id2word=d, random_state=LDA_SEED)

# Use a real loop name: binding `_` at the top level of a cell overrides
# IPython's "last output" variable for the rest of the session.
for topic in lda.show_topics():
    print(topic, end='\n\n')

(0, '0.005*"data" + 0.005*"presid" + 0.003*"steel" + 0.003*"vintag" + 0.003*"death" + 0.003*"tariff" + 0.003*"market" + 0.003*"trade" + 0.003*"hous" + 0.003*"plan"')

(1, '0.009*"month" + 0.007*"trump" + 0.006*"hous" + 0.006*"februari" + 0.006*"home" + 0.006*"report" + 0.006*"price" + 0.006*"januari" + 0.005*"school" + 0.005*"presid"')

(2, '0.009*"network" + 0.006*"random" + 0.005*"project" + 0.005*"neural" + 0.005*"matrix" + 0.005*"estim" + 0.005*"increas" + 0.005*"deep" + 0.005*"growth" + 0.004*"sens"')

(3, '0.008*"data" + 0.005*"time" + 0.005*"thing" + 0.005*"post" + 0.004*"node" + 0.004*"edg" + 0.004*"appear" + 0.004*"work" + 0.004*"make" + 0.003*"social"')

(4, '0.007*"compani" + 0.006*"peopl" + 0.004*"busi" + 0.004*"presid" + 0.003*"trump" + 0.003*"base" + 0.003*"work" + 0.003*"home" + 0.002*"citi" + 0.002*"countri"')

(5, '0.007*"appear" + 0.005*"post" + 0.004*"world" + 0.004*"state" + 0.004*"big" + 0.004*"scienc" + 0.003*"compani" + 0.003*"offic" + 0.003*"special" + 0.003*"mu

In [9]:
# Turn on pyLDAvis' notebook integration so the prepared object that ends this
# cell is shown inline as the cell output.
pyLDAvis.enable_notebook()

# Blog docs: visualise the fitted LDA model against the blog-entry
# bag-of-words vectors, using the shared dictionary `d`.
pyLDAvis.gensim.prepare(lda, blog_bows, d)




In [10]:
# News docs: visualise the fitted LDA model against the news-entry
# bag-of-words vectors, using the shared dictionary `d`.
pyLDAvis.gensim.prepare(lda, news_bows, d)

In [11]:
import numpy as np

# Ask the fitted LDA model for the topic mixture of every document, keeping
# one list per split.
training_tops = [lda.get_document_topics(bow=doc_bow) for doc_bow in training_bows]
testing_tops = [lda.get_document_topics(bow=doc_bow) for doc_bow in testing_bows]
valid_tops = [lda.get_document_topics(bow=doc_bow) for doc_bow in validation_bows]


def normalize_topic_features(setx, num_topics=10):
    """Turn sparse per-document topic lists into a dense feature matrix.

    Parameters
    ----------
    setx : iterable of iterables of (topic_id, weight) pairs
        One inner iterable per document. Pairs may appear in any order, and
        topics that are absent from a document get a weight of 0.0. The input
        is only read, never modified.
    num_topics : int, optional
        Number of feature columns, i.e. the number of topics in the model.
        Defaults to 10.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of documents, num_topics)`` in which
        column ``k`` holds the weight of topic ``k``. An empty ``setx`` yields
        an array of shape ``(0, num_topics)``.

    Raises
    ------
    ValueError
        If a topic id falls outside ``range(num_topics)``.
    """
    documents = list(setx)
    features = np.zeros((len(documents), num_topics), dtype=float)
    for row, topic_pairs in enumerate(documents):
        for pair in topic_pairs:
            topic_id, weight = pair[0], pair[1]
            # Reject ids that would silently land in the wrong column (a
            # negative id would otherwise wrap around to the end of the row).
            if not 0 <= topic_id < num_topics:
                raise ValueError(
                    "topic id %r is outside range(%d)" % (topic_id, num_topics)
                )
            features[row, topic_id] = weight
    return features

# Dense topic-weight matrices, one row per document, for each split.
trainingx = normalize_topic_features(training_tops)
testingx = normalize_topic_features(testing_tops)
validx = normalize_topic_features(valid_tops)


# Raw "is_blog" labels for each split.
# NOTE(review): assumes .entries is ordered like .keywords_as_docs so labels
# line up with the feature rows -- TODO confirm against nbcode.
trainy = [entry["is_blog"] for entry in training_set.entries]
testy = [entry['is_blog'] for entry in testing_set.entries]
validy = [entry['is_blog'] for entry in validation_set.entries]



10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
1

10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
1

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np

def fixed_response_vars(data):
    """Normalise raw ``is_blog`` labels to 1/0 and drop missing ones.

    Parameters
    ----------
    data : list
        Raw labels. Recognised "true" spellings (including the typo
        ``'ture'``) become ``1``; recognised "false" spellings (including
        ``'fals'`` and ``'flase'``) become ``0``; ``None`` entries are
        removed. Because ``1 == True`` and ``0 == False``, labels that are
        already 1/0 pass through unchanged, so the function is safe to call
        twice on the same list.

    Returns
    -------
    list
        The same list object, updated in place. Any unrecognised value is
        printed together with its type and kept as is.

    Notes
    -----
    Removing ``None`` labels shortens the list; callers must drop the
    matching feature rows themselves to keep features and labels aligned.
    """
    truthy = ('ture', True, 'True', 'true')
    falsy = (False, 'fals', 'false', 'flase', 'False')

    # Build the cleaned labels in a fresh list: removing items from `data`
    # while indexing over its original length skips elements and eventually
    # runs past the end of the list.
    cleaned = []
    for value in data:
        if value in truthy:
            cleaned.append(1)
        elif value in falsy:
            cleaned.append(0)
        elif value is None:
            continue
        else:
            print(type(value), value)
            cleaned.append(value)

    # Write back through the slice so the caller's list object is the one
    # that gets updated and returned.
    data[:] = cleaned
    return data

# Map the raw labels to 1/0; entries whose label is None are removed.
trainy = fixed_response_vars(data=trainy)
testy = fixed_response_vars(data=testy)
validy = fixed_response_vars(data=validy)

# A removed label would leave the feature matrix one row longer than the
# label list, so fail here with a clear message rather than deep inside
# scikit-learn.
assert len(trainy) == len(trainingx), "training labels and features are misaligned"
assert len(testy) == len(testingx), "testing labels and features are misaligned"

# The 'sag' solver shuffles the data while fitting; pin the seed so the fitted
# model and the report below are repeatable.
# NOTE(review): warnings are silenced earlier in this notebook, so a failure
# to converge within max_iter=50 would go unnoticed, and the recorded report
# shows class 1 is never predicted -- revisit max_iter / class weighting.
logit = LogisticRegression(solver="sag", max_iter=50, random_state=42)
logit.fit(trainingx, trainy)
pred = logit.predict(testingx)
print(classification_report(y_true=testy, y_pred=pred,  digits=3))



             precision    recall  f1-score   support

          0      0.790     1.000     0.883       859
          1      0.000     0.000     0.000       228

avg / total      0.624     0.790     0.698      1087

