In [1]:
import pandas as pd
import metapy
import boto3

In [2]:
# Fetch news.csv from the 'bdnews' S3 bucket and parse it into a DataFrame.
# No credentials are passed explicitly, so boto3 has to resolve them itself
# (presumably from its default configuration — confirm for the target environment).
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='bdnews', Key=u'news.csv')
# The response body is handed straight to pandas; no local copy is written.
df = pd.read_csv(obj['Body'])

In [3]:
# (rows, columns) of the news frame loaded above.
df.shape

(142570, 10)

In [4]:
# Create the inverted index described by the news.toml configuration file.
# NOTE(review): presumably news.toml points at the news.dat corpus that a cell
# near the end of this notebook writes; if so, that file must already exist
# before this cell runs on a fresh kernel — confirm.
inv_idx = metapy.index.make_inverted_index('news.toml')

In [5]:
# Number of documents in the inverted index.
inv_idx.num_docs()

142570

In [6]:
# Number of distinct terms in the inverted index.
inv_idx.unique_terms()

244408

In [7]:
# Average document length reported by the index.
inv_idx.avg_doc_length()

374.1612854003906

In [8]:
# Total number of terms in the corpus, as reported by the index.
inv_idx.total_corpus_terms()

53344177

In [9]:
# Dirichlet-prior ranker, constructed with the library's default parameters.
ranker = metapy.index.DirichletPrior()

In [10]:
# A query is represented as a Document whose content is the query text.
query = metapy.index.Document()
query.content("donald trump and gardens")

In [11]:
# Score the query against the index and keep the 5 best matches.
# The result is a list of (document id, score) pairs.
top_docs = ranker.score(inv_idx, query, num_results=5)
top_docs

[(132408, 5.050427436828613),
 (34281, 4.792304515838623),
 (32614, 4.589043617248535),
 (4203, 4.517133712768555),
 (6136, 4.5021748542785645)]

In [12]:
# Print a numbered, 250-character preview of every retrieved article.
for rank, (doc_id, _score) in enumerate(top_docs, start=1):
    preview = inv_idx.metadata(doc_id).get('content')[:250]
    print("{}. {}...\n".format(rank, preview))

1. One part of the Obama White House that will endure under Trump: Michelle’s vegetable garden      It was less than a year ago that Michelle Obama referred to it as “her baby. ” She wasn’t talking about her youngest daughter, Sasha, or the Obama’s pet ...

2. Melania Trump helps open garden dedicated to first ladies at DC children’s hospital  (CNN) First lady Melania Trump visited Children’s National Health System in Washington on Friday, her second visit there in as many months, to help the hospital mark...

3. Melania Trump visits Japanese garden with Akie Abe  Delray Beach, Florida (CNN) First Lady Melania Trump is committed to continuing the White House garden tradition, showcasing her interest in a Saturday morning visit to the Morikami Museum and Japan...

4. How I Created My Very First Garden From Scratch - The New York Times MILL VALLEY, Calif.  —   “Remind me again why we bought this house?” my husband asked. He was standing on the front porch a few days after we finally move

# Tópicos

In [13]:
# Create the forward index from the same news.toml configuration;
# the topic-model cells below use it to map term ids back to text.
fidx = metapy.index.make_forward_index('news.toml')

In [None]:
# Wrap the forward index in a dataset; it is passed to the LDA trainer as
# `docs` and iterated document by document further down.
dset = metapy.classify.MulticlassDataset(fidx)

In [None]:
# LDA settings, named so they can be tuned in one place.
NUM_TOPICS = 10
LDA_ALPHA = 0.1
LDA_BETA = 0.1
NUM_ITERS = 10

# Train the topic model on the dataset, then save it under the name
# 'lda-news', which the next cell loads back.
model = metapy.topics.LDAParallelGibbs(
    docs=dset,
    num_topics=NUM_TOPICS,
    alpha=LDA_ALPHA,
    beta=LDA_BETA,
)
model.run(num_iters=NUM_ITERS)
model.save('lda-news')

In [None]:
# Load the model saved as 'lda-news'. This rebinds `model` from the trainer
# object to a TopicModel, which is what the cells below query
# (num_topics, top_k, topic_distribution).
model = metapy.topics.TopicModel('lda-news')

In [None]:
# Print the 10 highest-scoring terms of every topic.
# The scorer depends only on the model, so it is built once instead of being
# re-created for each topic.
scorer = metapy.topics.BLTermScorer(model)
for topic in range(model.num_topics()):
    print("Topic {}:".format(topic + 1))
    for tid, val in model.top_k(topic, 10, scorer):
        # Term ids are translated back to text through the forward index.
        print("{}: {}".format(fidx.term_text(tid), val))
    print("======\n")

In [None]:
import pandas as pd

data = []
for doc in dset:
    proportions = model.topic_distribution(doc.id)
    data.append([dset.label(doc)] + [proportions.probability(i) for i in range(0, model.num_topics())])
df = pd.DataFrame(data, columns=['label'] + ["Topic {}".format(i + 1) for i in range(0, model.num_topics())])

In [None]:
df.head()

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

for i in range(0, model.num_topics()):
    print("Topic {}".format(i + 1))
    sns.swarmplot(data=df, x='label', y="Topic {}".format(i + 1))
    plt.show()

In [None]:
# Export the news articles to a text file, one article per line, each line
# being "title content author" separated by single spaces.
# Expects `df` to be the news DataFrame with 'title', 'content' and 'author'
# columns.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default applies — confirm it matches what the indexer expects.
# NOTE(review): str() turns missing values into the literal text 'nan' —
# confirm that is acceptable for the corpus.
with open("/home/hadoop/st1800eafit-final/data/news/news.dat", "w") as news_file:
    # Iterating the three columns in parallel avoids building a Series per row.
    for title, content, author in zip(df['title'], df['content'], df['author']):
        texto = str(title) + ' ' + str(content) + ' ' + str(author)
        # Embedded newlines would split one article across several lines.
        texto = texto.replace("\n", " ")
        news_file.write(texto + '\n')

In [None]:
# Show the kernel's current working directory; relative names used in this
# notebook ('news.toml', 'lda-news') are resolved against it.
import os
os.getcwd()