In [None]:
!pip install pandas
!pip install metapy
!pip install pyspark
!pip install nltk
!pip install boto3

In [1]:
import pandas as pd
import metapy
import boto3

In [2]:
# Stream 'news.csv' from the 'finaltext' S3 bucket straight into a DataFrame.
s3 = boto3.client('s3')
obj = s3.get_object(Key='news.csv', Bucket='finaltext')
csv_stream = obj['Body']
df = pd.read_csv(csv_stream)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(142570, 10)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,id_news,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['id', 'id_news', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [None]:
# Export every article as a single line of text (title, body, author) to news.dat,
# one line per DataFrame row.
# NOTE(review): presumably this is the corpus file referenced by 'news.toml' - confirm.
NEWS_DAT_PATH = "/home/hadoop/st1800eafit-final/data/news/news.dat"


def _as_text(column):
    """Return `column` as strings, mapping missing values to '' instead of the literal 'nan'."""
    return column.fillna('').astype(str)


# Vectorised concatenation replaces the row-by-row `iterrows` loop.
lines = _as_text(df['title']) + ' ' + _as_text(df['content']) + ' ' + _as_text(df['author'])
# Each document must stay on one line, so embedded newlines become spaces.
lines = lines.str.replace("\n", " ", regex=False)

# Explicit UTF-8: the articles contain non-ASCII punctuation (curly quotes, long dashes),
# so the platform default encoding must not be relied on. The context manager closes
# the file even if a write fails part-way through.
with open(NEWS_DAT_PATH, "w", encoding="utf-8") as file1:
    file1.writelines(line + '\n' for line in lines)

In [6]:
# Create the inverted index from the 'news.toml' configuration file.
inv_idx = metapy.index.make_inverted_index('news.toml')

In [7]:
# Number of documents held in the inverted index.
inv_idx.num_docs()

142570

In [8]:
# Number of distinct terms in the inverted index.
inv_idx.unique_terms()

244408

In [9]:
# Average document length reported by the index.
inv_idx.avg_doc_length()

374.1612854003906

In [10]:
# Name the index reports for itself.
inv_idx.index_name()

'news-idx/inv'

In [11]:
# Total number of term occurrences across the whole corpus.
inv_idx.total_corpus_terms()

53344177

In [12]:
# Okapi BM25 ranker, constructed with no arguments (library default parameters).
ranker = metapy.index.OkapiBM25()

In [13]:
# Wrap the free-text query in a Document so it can be handed to the ranker.
query = metapy.index.Document()
query.content("donald trump and gardens")

In [14]:
# Score the query against the inverted index and keep the 5 best matches.
top_docs = ranker.score(inv_idx, query, num_results=5)
# Each entry displays as a (document id, score) pair.
top_docs

[(32614, 11.75703239440918),
 (34281, 11.522643089294434),
 (131604, 11.261302947998047),
 (12848, 10.96080493927002),
 (9336, 10.726383209228516)]

In [15]:
# Print the first 250 characters of each hit, numbered from 1 in ranking order.
for rank, (d_id, _) in enumerate(top_docs, start=1):
    snippet = inv_idx.metadata(d_id).get('content')[:250]
    print("{}. {}...\n".format(rank, snippet))

1. Melania Trump visits Japanese garden with Akie Abe  Delray Beach, Florida (CNN) First Lady Melania Trump is committed to continuing the White House garden tradition, showcasing her interest in a Saturday morning visit to the Morikami Museum and Japan...

2. Melania Trump helps open garden dedicated to first ladies at DC children’s hospital  (CNN) First lady Melania Trump visited Children’s National Health System in Washington on Friday, her second visit there in as many months, to help the hospital mark...

3. How Trump can have his best first 100 days: Eat his veggies    George Ball is chairman and chief executive of the W. Atlee Burpee & Co. and a past president of the American Horticultural Society.               In 1933, Franklin Delano Roosevelt famou...

4. Donald Trump Celebrates House Vote for Obamacare Replacement: ’It’s Essentially Dead’ - Breitbart President Donald Trump celebrated the House vote to replace Obamacare on Thursday, bringing House Republicans to the White Ho

# Tópicos

In [16]:
# Create the forward index from the same 'news.toml' configuration file.
fidx = metapy.index.make_forward_index('news.toml')

In [17]:
# Build a dataset over the forward index; it is the `docs` input of the LDA sampler.
dset = metapy.classify.MulticlassDataset(fidx)

In [None]:
# Fit a 5-topic LDA model with the parallel Gibbs sampler (100 iterations),
# then save it under the name 'lda-news'.
lda_settings = dict(num_topics=5, alpha=0.1, beta=0.1)
model = metapy.topics.LDAParallelGibbs(docs=dset, **lda_settings)
model.run(num_iters=100)
model.save('lda-news')

In [None]:
# Load the topic model stored under the name 'lda-news'.
model = metapy.topics.TopicModel('lda-news')

In [None]:
# Print the 10 highest-scoring terms of every topic together with their scores.
# The scorer depends only on the model, so it is built once instead of once per topic.
scorer = metapy.topics.BLTermScorer(model)
for topic in range(model.num_topics()):
    print("Topic {}:".format(topic + 1))
    for tid, val in model.top_k(topic, 10, scorer):
        # Term ids are resolved to their text through the forward index.
        print("{}: {}".format(fidx.term_text(tid), val))
    print("======\n")

In [None]:
import pandas as pd

# One row per document: its label followed by its probability under each topic.
topic_count = model.num_topics()
topic_columns = ["Topic {}".format(k + 1) for k in range(topic_count)]


def _topic_row(doc):
    """Return [label, p(topic 1), ..., p(topic K)] for a single document."""
    proportions = model.topic_distribution(doc.id)
    return [dset.label(doc)] + [proportions.probability(k) for k in range(topic_count)]


data = [_topic_row(doc) for doc in dset]
df_topicos = pd.DataFrame(data, columns=['label'] + topic_columns)

In [None]:
# Preview the first rows of the per-document topic proportions.
df_topicos.head()

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Draw one swarm plot per topic: topic proportion (y) grouped by document label (x).
# NOTE(review): seaborn documents that swarm plots scale poorly with many
# observations; consider a strip or box plot if this cell is too slow.
for topic_number in range(1, model.num_topics() + 1):
    column = "Topic {}".format(topic_number)
    print(column)
    sns.swarmplot(data=df_topicos, x='label', y=column)
    plt.show()

In [None]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
import requests, zipfile, io, os
import nltk
import codecs
from nltk.corpus import stopwords

# Download the 'punkt' and 'stopwords' NLTK resources, then build the English stop-word set.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

english_stop_words = stopwords.words('english')
stop_words_nltk = set(english_stop_words)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, BisectingKMeans
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from pyspark.sql.types import StringType

# Obtain the Spark session via the builder so an already-active session is reused.
# Constructing SparkContext directly raises "Cannot run multiple SparkContexts at once"
# whenever this cell is re-run or the kernel already provides a context.
spark = (
    SparkSession.builder
    .master('local')
    .appName("app-topic-detection")
    .getOrCreate()
)
# Keep `sc` available for code that uses the lower-level context API.
sc = spark.sparkContext

In [20]:
# Download the Spark notebook from the 'finaltext' bucket into the working directory.
s3 = boto3.client('s3')
s3.download_file(
    Bucket='finaltext',
    Key='e-BS0ZP4B9V5EBU0ARR71Z4ECN8/Spark-Noteboook.ipynb',
    Filename='Spark-Noteboook.ipynb',
)
