In [2]:
!pip install pandas
!pip install metapy
!pip install nltk
!pip install boto3



### Indexación y recuperación de información usando MeTA

In [3]:
import pandas as pd
import metapy
import boto3

## Leer noticias desde S3

In [4]:
# Fetch news.csv from the `finaltext` bucket and let pandas parse it directly
# from the response stream.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='finaltext', Key='news.csv')
csv_stream = obj['Body']
df = pd.read_csv(csv_stream)

In [5]:
# Show the (rows, columns) dimensions of the loaded news DataFrame.
df.shape

(142570, 10)

In [6]:
# Preview the first rows of the news DataFrame.
df.head()

Unnamed: 0,id,id_news,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [7]:
# List the column labels available in the news DataFrame.
df.columns

Index(['id', 'id_news', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

### Construcción de la información de MeTA

In [8]:
# Build the corpus file: one document per line, made of the article's title,
# content and author separated by single spaces.
# `with` guarantees the file is flushed and closed even if a row raises, and
# the explicit UTF-8 encoding keeps non-ASCII text (curly quotes, dashes) from
# depending on the machine's locale.
# NOTE(review): str() renders missing values (NaN) as the literal text "nan",
# exactly as before; decide whether such cells should be blanked instead.
with open("/home/hadoop/st1800eafit-final/data/news/news.dat", "w", encoding="utf-8") as file1:
    # Iterate only over the three columns that are used instead of building a
    # full Series per row with iterrows().
    for title, content, author in zip(df['title'], df['content'], df['author']):
        texto = str(title) + ' ' + str(content) + ' ' + str(author)
        # An embedded newline would split one article across several lines.
        texto = texto.replace("\n", " ")
        file1.write(texto + '\n')

### Ejecución de MeTA

In [9]:
# Create the inverted index from the settings in news.toml (the path is
# relative to the kernel's working directory).
# NOTE(review): news.toml is not shown in this notebook; presumably it points
# at the news.dat corpus written above. Confirm.
inv_idx = metapy.index.make_inverted_index('news.toml')

In [10]:
# Number of documents held by the inverted index.
inv_idx.num_docs()

142570

In [11]:
# Number of distinct terms in the index vocabulary.
inv_idx.unique_terms()

244408

In [12]:
# Average document length as reported by the index.
inv_idx.avg_doc_length()

374.1612854003906

In [13]:
# Name of the index as reported by the index object.
inv_idx.index_name()

'news-idx/inv'

In [14]:
# Total number of terms across the whole corpus.
inv_idx.total_corpus_terms()

53344177

### Inicializamos el Ranking BM25

In [15]:
# Instantiate the Okapi BM25 ranker; no arguments are passed, so its default
# parameters are used.
ranker = metapy.index.OkapiBM25()

### Ejecutar query vs Índice Invertido

In [24]:
# Wrap the free-text query in a Document object so the ranker can score it
# against the index.
query = metapy.index.Document()
query.content("donald trump and clinton")

### Evaluación de la búsqueda

In [25]:
# Score the query against the inverted index and keep the 5 best results.
# The result is displayed as a list of (document id, score) pairs.
top_docs = ranker.score(inv_idx, query, num_results=5)
top_docs

[(93504, 7.9856061935424805),
 (15765, 7.939201831817627),
 (19269, 7.900861740112305),
 (20936, 7.895786762237549),
 (58411, 7.856106281280518)]

### Mostramos las noticias más relevantes de la búsqueda

In [26]:
# Print a numbered preview (first 250 characters of the 'content' metadata
# field) for each ranked document; the score itself is ignored here.
for rank, (d_id, _) in enumerate(top_docs, start=1):
    content = inv_idx.metadata(d_id).get('content')
    preview = content[0:250]
    print("{}. {}...\n".format(rank, preview))

1. Super Tuesday’s winners and losers, state by state Voters head to the polls in 12 states, and one territory, for Super Tuesday.  Here are the projected winners, state by state. (For   coverage of the largest day during the primary and caucus season, ...

2. Rafael Cruz: Donald Trump ‘Would Be Worse Than Hillary Clinton, But He Cannot Beat Hillary Clinton’ - Breitbart Citing GOP frontrunner Donald Trump’s long history of supporting “ ” Democrats, Rafael Cruz, father of Ted Cruz, told Breitbart News Daily...

3. Hillary Clinton: Trump Will Lead America Into War with Thin Skin Hillary Clinton painted her Republican rival Donald Trump as a “dangerously incoherent” and unstable man, claiming that “It’s not hard to imagine Donald Trump leading us into a war just...

4. Donald Trump: Hillary Clinton ’Is owned by Wall Street’ - Breitbart Republican nominee Donald Trump is using Twitter to respond to Hillary Clinton’s speech at the Democratic National Convention. [Trump argues Clinton is “ow

# Tópicos

In [19]:
# Create the forward index from the same news.toml configuration used for the
# inverted index.
fidx = metapy.index.make_forward_index('news.toml')

In [20]:
# Wrap the forward index in a MulticlassDataset; this is the object handed to
# the LDA model in the next cell.
dset = metapy.classify.MulticlassDataset(fidx)

In [None]:
# Alternative inference method kept for reference (parallel Gibbs, 10 topics):
#model = metapy.topics.LDAParallelGibbs(docs=dset, num_topics=10, alpha=0.1, beta=0.1)
# Fit a 2-topic LDA model with collapsed variational Bayes (num_iters=1000)
# and save it as 'lda-news', which the following cell loads back.
model = metapy.topics.LDACollapsedVB(dset, num_topics=2, alpha=1.0, beta=0.01)
model.run(num_iters=1000)
model.save('lda-news')

In [None]:
# Load the model saved as 'lda-news'. Note that this rebinds `model` from the
# LDA inference object created earlier to a TopicModel used for querying.
model = metapy.topics.TopicModel('lda-news')

In [None]:
# For each topic, print a header followed by its 10 top terms (as ranked by
# BLTermScorer) with their scores, then a separator line.
n_topics = model.num_topics()
for topic_idx in range(n_topics):
    print("Topic {}:".format(topic_idx + 1))
    scorer = metapy.topics.BLTermScorer(model)
    top_terms = model.top_k(topic_idx, 10, scorer)
    for tid, val in top_terms:
        term = fidx.term_text(tid)
        print("{}: {}".format(term, val))
    print("======\n")

In [None]:
import pandas as pd

# Build one record per document: its label followed by the probability the
# model assigns to each topic, then collect the records into a DataFrame with
# columns 'label', 'Topic 1', 'Topic 2', ...
topic_ids = range(0, model.num_topics())
data = []
for doc in dset:
    proportions = model.topic_distribution(doc.id)
    record = [dset.label(doc)]
    record.extend(proportions.probability(i) for i in topic_ids)
    data.append(record)
topic_columns = ["Topic {}".format(i + 1) for i in topic_ids]
df_topicos = pd.DataFrame(data, columns=['label'] + topic_columns)

In [None]:
# Preview the first rows of the per-document topic proportions.
df_topicos.head()

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

for i in range(0, model.num_topics()):
    print("Topic {}".format(i + 1))
    sns.swarmplot(data=df_topicos, x='label', y="Topic {}".format(i + 1))
    plt.show()

In [None]:
# Download the Spark notebook from the `finaltext` bucket into the kernel's
# working directory, using a freshly created S3 client.
# NOTE(review): "Noteboook" (three o's) appears in both the object key and the
# local file name; verify the real key in S3 before correcting the spelling.
s3 = boto3.client('s3')
s3.download_file('finaltext', 'e-BS0ZP4B9V5EBU0ARR71Z4ECN8/Spark-Noteboook.ipynb', 'Spark-Noteboook.ipynb')
