In [3]:
!pip install pandas
!pip install metapy
!pip install pyspark
!pip install nltk
!pip install boto3

Collecting boto3
[?25l  Downloading https://files.pythonhosted.org/packages/a6/1f/b272ead5ccc5370717f3c65ebd5092feab90e748db041bd96c565e7d1a72/boto3-1.9.169-py2.py3-none-any.whl (128kB)
[K     |████████████████████████████████| 133kB 25.6MB/s eta 0:00:01
[?25hCollecting s3transfer<0.3.0,>=0.2.0 (from boto3)
[?25l  Downloading https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl (70kB)
[K     |████████████████████████████████| 71kB 21.9MB/s eta 0:00:01
[?25hCollecting botocore<1.13.0,>=1.12.169 (from boto3)
[?25l  Downloading https://files.pythonhosted.org/packages/28/ac/a43d37f371f5854514128d7c54887176b8df3bc9925a25e5096298033f93/botocore-1.12.169-py2.py3-none-any.whl (5.5MB)
[K     |████████████████████████████████| 5.6MB 22.9MB/s eta 0:00:01     |████████████████▎               | 2.8MB 22.9MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1 (from boto3)
  Downloading https://files

In [4]:
import pandas as pd
import metapy
import boto3

In [6]:
# Fetch the object 'news.csv' from the S3 bucket 'finaltext' and parse its
# body as CSV into the pandas DataFrame `df` used by the later cells.
# NOTE(review): no credentials are passed to boto3.client here — presumably
# they are resolved from the environment; confirm.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='finaltext', Key='news.csv')
df = pd.read_csv(obj['Body'])

In [8]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple.
df.shape

(142570, 10)

In [9]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,id,id_news,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [10]:
# List the column labels of the loaded DataFrame.
df.columns

Index(['id', 'id_news', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [11]:
# Create the metapy inverted index from the configuration file 'news.toml'
# (the config file itself is not shown in this notebook).
inv_idx = metapy.index.make_inverted_index('news.toml')

In [12]:
# Number of documents held by the inverted index.
inv_idx.num_docs()

142570

In [14]:
# Number of unique terms in the inverted index.
inv_idx.unique_terms()

244408

In [15]:
# Average document length as reported by the index.
inv_idx.avg_doc_length()

374.1612854003906

In [16]:
# Name of the index as reported by metapy.
inv_idx.index_name()

'news-idx/inv'

In [17]:
# Total number of terms in the corpus as reported by the index.
inv_idx.total_corpus_terms()

53344177

In [18]:
# Instantiate an Okapi BM25 ranker; no arguments are passed, so metapy's
# default parameters apply.
ranker = metapy.index.OkapiBM25()

In [19]:
# Wrap the free-text query in a metapy Document so it can be handed to the
# ranker in the next cell.
query = metapy.index.Document()
query.content("donald trump and gardens")

In [20]:
# Score the query against the inverted index and keep the 5 best matches.
# The displayed result is a list of (document id, score) pairs.
top_docs = ranker.score(inv_idx, query, num_results=5)
top_docs

[(32614, 11.75703239440918),
 (34281, 11.522643089294434),
 (131604, 11.261302947998047),
 (12848, 10.96080493927002),
 (9336, 10.726383209228516)]

In [21]:
# Print a numbered, 250-character preview of each top-ranked document.
for rank, (doc_id, _score) in enumerate(top_docs, start=1):
    preview = inv_idx.metadata(doc_id).get('content')[:250]
    print("{}. {}...\n".format(rank, preview))

1. Melania Trump visits Japanese garden with Akie Abe  Delray Beach, Florida (CNN) First Lady Melania Trump is committed to continuing the White House garden tradition, showcasing her interest in a Saturday morning visit to the Morikami Museum and Japan...

2. Melania Trump helps open garden dedicated to first ladies at DC children’s hospital  (CNN) First lady Melania Trump visited Children’s National Health System in Washington on Friday, her second visit there in as many months, to help the hospital mark...

3. How Trump can have his best first 100 days: Eat his veggies    George Ball is chairman and chief executive of the W. Atlee Burpee & Co. and a past president of the American Horticultural Society.               In 1933, Franklin Delano Roosevelt famou...

4. Donald Trump Celebrates House Vote for Obamacare Replacement: ’It’s Essentially Dead’ - Breitbart President Donald Trump celebrated the House vote to replace Obamacare on Thursday, bringing House Republicans to the White Ho

# Tópicos

In [22]:
# Create the metapy forward index from the 'news.toml' configuration file.
fidx = metapy.index.make_forward_index('news.toml')

In [23]:
# Wrap the forward index in a MulticlassDataset; it is the `docs` input of the
# topic model and is iterated again when collecting topic proportions.
dset = metapy.classify.MulticlassDataset(fidx)

In [None]:
# Fit a 5-topic LDA model with the parallel Gibbs sampler (alpha=0.1, beta=0.1),
# run it for 100 iterations and save the result under the prefix 'lda-news'.
# NOTE(review): no random seed is set in this cell, so results may differ
# between runs — confirm whether metapy exposes one.
model = metapy.topics.LDAParallelGibbs(docs=dset, num_topics=5, alpha=0.1, beta=0.1)
model.run(num_iters=100)
model.save('lda-news')

In [None]:
# Rebind `model` to a TopicModel loaded from the 'lda-news' prefix; the cells
# below query this object rather than the sampler.
model = metapy.topics.TopicModel('lda-news')

In [None]:
# For every topic, list its ten top terms (ranked with the BL term scorer)
# together with their scores, followed by a separator line.
topic_count = model.num_topics()
for topic_idx in range(topic_count):
    print("Topic {}:".format(topic_idx + 1))
    scorer = metapy.topics.BLTermScorer(model)
    for term_id, weight in model.top_k(topic_idx, 10, scorer):
        term = fidx.term_text(term_id)
        print("{}: {}".format(term, weight))
    print("======\n")

In [None]:
import pandas as pd

# Build one row per document: its label followed by the probability the model
# assigns to each topic. Columns are 'label', 'Topic 1', 'Topic 2', ...
topic_columns = ["Topic {}".format(k + 1) for k in range(model.num_topics())]

data = []
for doc in dset:
    distribution = model.topic_distribution(doc.id)
    row = [dset.label(doc)]
    row.extend(distribution.probability(k) for k in range(model.num_topics()))
    data.append(row)

df_topicos = pd.DataFrame(data, columns=['label'] + topic_columns)

In [None]:
# Preview the per-document topic proportions.
df_topicos.head()

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

for i in range(0, model.num_topics()):
    print("Topic {}".format(i + 1))
    sns.swarmplot(data=df_topicos, x='label', y="Topic {}".format(i + 1))
    plt.show()

In [7]:
# Write one document per line to news.dat: title, content and author joined by
# single spaces, with embedded newlines flattened to spaces so that every
# article stays on a single line.
# NOTE(review): presumably this is the line corpus that 'news.toml' points at —
# confirm; if so this cell has to run before the index-building cells.
#
# The file is opened in a `with` block so the handle is closed even if a row
# fails, and the encoding is fixed to UTF-8 because the articles contain
# non-ASCII characters (dashes, curly quotes) that would otherwise depend on
# the locale's default encoding.
with open("/home/hadoop/st1800eafit-final/data/news/news.dat", "w", encoding="utf-8") as file1:
    for index, row in df.iterrows():
        texto = str(row['title']) + ' ' + str(row['content']) + ' ' + str(row['author'])
        texto = texto.replace("\n", " ")
        # write(), not writelines(): `texto` is a single string, not a list of lines.
        file1.write(texto + '\n')

In [None]:
from pyspark import SparkConf, SparkContext

# Configure Spark to run on YARN in client mode. "yarn" plus an explicit
# deploy mode replaces the deprecated "yarn-client" master string.
conf = SparkConf().setMaster("yarn").set("spark.submit.deployMode", "client")

# Pass the configuration to the context (the original built `conf` but never
# used it). getOrCreate() lets this cell be re-run without failing because a
# SparkContext already exists in the kernel.
sc = SparkContext.getOrCreate(conf=conf)
print(sc.version)

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
# Create the SQLContext on top of the existing SparkContext `sc`; later cells
# use `sqlContext` to read the CSV file and to build DataFrames.
sqlContext = SQLContext(sc)

In [None]:
# NOTE(review): this only selects the "csv" format on the reader — no .load()
# is called, so `data` presumably holds a reader object rather than rows.
# A later cell reassigns `data` with a complete read; confirm this cell is
# still needed.
data = sqlContext.read.format("csv")

In [None]:
from pyspark.sql import SparkSession
# Get the active SparkSession, or create one, under the application name
# 'app-sentimientos'.
spark = SparkSession.builder.appName('app-sentimientos').getOrCreate()

In [None]:
# Dump the pandas DataFrame to a local CSV file; a later cell loads this same
# path with the Spark CSV reader.
# index=False keeps the pandas row index out of the file, so the reader does
# not receive an extra unnamed leading column.
df.to_csv("/home/hadoop/news.csv", index=False)

In [None]:
# `os` is not imported by any cell visible above, so import it here to avoid
# a NameError on a fresh kernel.
import os

# Show the kernel's current working directory.
os.getcwd()

In [None]:
# importing some libraries
import os  # needed for os.path.realpath below; not imported by any earlier visible cell
import pandas as pd
import pyspark
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
# stuff we'll need for text processing
from nltk.corpus import stopwords
import re as re
from pyspark.ml.feature import CountVectorizer , IDF
# stuff we'll need for building the model

from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
# reading the data
# The file was written by pandas, which wraps fields in double quotes when
# needed and escapes embedded quotes by doubling them; article bodies may also
# contain newlines inside a quoted field. multiLine lets one record span
# several physical lines and escape='"' matches pandas' quote doubling, so the
# 'content' column is not split or truncated.
# NOTE(review): the path has no scheme, so Spark presumably resolves it against
# the cluster's default filesystem — confirm it points at the file written
# by df.to_csv.
data = sqlContext.read.format("csv") \
   .options(header='true', inferschema='true', multiLine='true', escape='"') \
   .load(os.path.realpath("/home/hadoop/news.csv"))

In [None]:
# Pull the 'content' field out of every row as an RDD and drop missing values.
# NOTE(review): the next cell repeats this exact statement before tokenising.
reviews = data.rdd.map(lambda x : x['content']).filter(lambda x: x is not None)

In [None]:
# Article bodies as an RDD of strings, with missing values removed.
reviews = data.rdd.map(lambda x : x['content']).filter(lambda x: x is not None)

# English stop words from NLTK. The list is kept under its original name and a
# set is built from it so each membership test is a hash lookup instead of a
# linear scan of the list.
StopWords = stopwords.words("english")
stop_word_set = set(StopWords)


def _tokenize(document):
    """Turn one article body into a list of cleaned tokens.

    The text is stripped and lower-cased, then split on any run of whitespace
    (spaces, tabs, newlines). Splitting on a single space only, as before,
    left words separated by a newline or tab glued together, and the
    alphabetic filter then discarded them.

    A token is kept when it is purely alphabetic, longer than 3 characters and
    not an English stop word.
    """
    words = document.strip().lower().split()
    return [
        word for word in words
        if word.isalpha() and len(word) > 3 and word not in stop_word_set
    ]


# RDD of (token list, running index) pairs; the index serves as the document id
# when the pairs are turned into a DataFrame.
tokens = reviews.map(_tokenize).zipWithIndex()

In [None]:
# Turn the (token list, index) pairs into a DataFrame with the columns
# 'list_of_words' and 'index'.
df_txts = sqlContext.createDataFrame(tokens, ["list_of_words",'index'])
# TF: count term occurrences per document into 'raw_features'; the vocabulary
# is capped at 5000 terms and minDF=10.0 is the minimum document frequency.
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=5000, minDF=10.0)
cvmodel = cv.fit(df_txts)
result_cv = cvmodel.transform(df_txts)
# IDF: fit on the raw counts and write the rescaled vectors to 'features',
# the column consumed by the LDA cells below.
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

In [None]:


# NOTE(review): `num_topics` and `max_iterations` are first assigned in the
# following cell, so this cell fails when the notebook is run top to bottom on
# a fresh kernel. It also hands a DataFrame selection straight to LDA.train.
# Looks like an earlier attempt superseded by the next cell — confirm and remove.
lda_model = LDA.train(result_tfidf[['index','features']], k=num_topics, maxIterations=max_iterations)

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA

# Hyper-parameters for the Spark MLlib LDA run.
num_topics = 10
max_iterations = 100

# mllib's LDA.train takes an RDD of [document id, mllib Vector] pairs, whereas
# `result_tfidf` is a DataFrame whose 'features' column was produced by the
# pyspark.ml feature transformers. So go through `.rdd` (a DataFrame has no
# .map of its own) and convert every vector with Vectors.fromML.
# The stray `r` statement that used to sit here raised a NameError and is gone.
corpus = result_tfidf.select('index', 'features') \
    .rdd \
    .map(lambda row: [row['index'], Vectors.fromML(row['features'])])

lda_model = LDA.train(corpus, k=num_topics, maxIterations=max_iterations)