In [15]:
import os

import numpy as np
import pandas as pd

from src.config import config
from src.corpus.documents import load_documents
from src.dashboard.app import app
from src.dashboard.cache import cache
from src.dashboard.models import Collection, TopicModelLoader
from src.dashboard.plots.topics import prepare_topics
from src.dashboard.plots.analysis import prepare_analysis
from src.preprocessing.documents import filter_documents

# Hyper-parameters identifying the trained topic model to analyse; they drive
# both the expected file name on disk and the database lookup below.
num_topics, epochs = 6, 1
fn = 'v0-T{num_topics}-E{epochs}'.format(epochs=epochs, num_topics=num_topics)
model_path = os.path.join(config['DEFAULT']['project_path'], 'models', fn, 'model.pt')

# Find the database record whose model matches the hyper-parameters above.
# A separate loop variable keeps `model_loader` at None when nothing matches,
# instead of leaving it bound to whichever record happened to be iterated last.
model_loader = None
with app.app_context():
    for candidate in TopicModelLoader.query.all():
        if (candidate.model.num_epochs, candidate.model.num_topics) == (epochs, num_topics):
            # Touch `topics` while the application context is still active.
            # NOTE(review): presumably this forces a lazy load that later cells rely on - confirm.
            _ = candidate.topics
            model_loader = candidate
            break

# Fail loudly with a targeted message rather than an AttributeError on `.path`.
if model_loader is None:
    raise LookupError(
        'No TopicModelLoader found in the database with num_topics={} and num_epochs={}'
        .format(num_topics, epochs)
    )

# Sanity check: the path stored in the database must equal the one derived from the config.
assert model_path == model_loader.path, 'Provided path does not match with the database. ' \
                                        'Found {}, Expected {}'.format(model_loader.path, model_path)


In [16]:
# Load the corpus, filter it, and run the model's own preprocessing on the result.
filtered_documents = filter_documents(
    docs=load_documents(verbose=True),
    verbose=True,
)
prepared_documents = model_loader.model.preprocess(filtered_documents)

# Build the topic visualisation data and take, per row, the index of the largest entry.
prepared_topic_vis = prepare_topics(
    topic_model=model_loader.model,
    documents=prepared_documents,
)
document_topics = np.argmax(prepared_topic_vis['doc_topic_dists'], axis=1)

# One record per (document, topic) pair; `id` is the running position in that pairing.
# NOTE(review): assumes rows of 'doc_topic_dists' follow the order of filtered_documents - confirm.
document_topics_df = pd.DataFrame([
    dict(id=position, text=document.text, author=document.author_id, topic=topic_index)
    for position, (document, topic_index) in enumerate(zip(filtered_documents, document_topics))
])

Loading Documents: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 880682/880682 [01:35<00:00, 9190.35it/s]
Extracting Unique Documents: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 670323/670323 [00:01<00:00, 434074.45it/s]


85590 duplicates removed from a total of 670323 documents.


Selecting Documents: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 82877/82877 [00:00<00:00, 1675808.75it/s]


In [52]:
# Spot check: for every document whose text contains the phrase below, print
# the sum of the first two entries of its topic distribution next to the text.
# NOTE(review): assumes rows of 'doc_topic_dists' align one-to-one with filtered_documents - confirm.
search_phrase = 'I believe that climate change is real'
for proba, doc in zip(prepared_topic_vis['doc_topic_dists'], filtered_documents):
    if search_phrase in doc.text:
        print(proba[0]+proba[1], doc.text)

0.6844071738644499 RT @ABCPolitics: "I believe that climate change is real, I believe that man has an impact on it," EPA chief nominee Andrew Wheeler tells Se…
