# Create and Index Signal dataset

In [3]:
import json
import os
import sys
import time

from elasticsearch import Elasticsearch, RequestError

In [4]:
# Client for the Elasticsearch node at localhost:9200, created with timeout=30.
# If the cluster needs authentication, pass http_auth=(user, password) and read
# the values from the environment instead of writing them in the notebook.
es = Elasticsearch(
    hosts=['http://localhost/'],
    port=9200,
    timeout=30,
)
es

### Very important
https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html

The default similarity (ranking function) in Elasticsearch is BM25.

### Create and delete index

In [58]:
# es.indices.create("articles", ignore=400)

{'acknowledged': True, 'index': 'articles', 'shards_acknowledged': True}

In [57]:
# es.indices.delete(index='articles', ignore=[400, 404])

{'acknowledged': True}

In [48]:
# Ask Elasticsearch how many documents the 'articles' index holds.
es.count(index='articles')

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'count': 1000000}

In [None]:
def index_from_path(es, inputFile, indexName):
    """
    Index every line of a JSON-lines file as one document in Elasticsearch.

    Each line of ``inputFile`` is parsed with ``json.loads`` and sent to
    ``es.index`` with ``doc_type='article'`` and the 1-based line number as
    the document id. Progress is printed every 100000 lines and once more
    after the last line.

    :param es: the ES instance (any object exposing a compatible ``index`` method)
    :param inputFile: path of the JSON-lines file to read (opened as UTF-8)
    :param indexName: name of the index the documents are written to
    :return: None

    Error handling:
    - ``RequestError`` is caught: the failing line number, status code and
      error string are printed, the function sleeps 60 seconds and then
      continues with the next line. Example seen while indexing:
      elasticsearch.exceptions.RequestError: TransportError(400, u'mapper_parsing_exception',
      u'failed to parse [bounding_box]')

    - Anything that is not a ``RequestError`` is not caught here and propagates
      to the caller. Errors previously observed while indexing:
      elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by -
      ReadTimeoutError(HTTPConnectionPool(host=u'localhost', port=9200): Read timed out. (read timeout=10))

      elasticsearch.exceptions.ConnectionError: ConnectionError(('Connection aborted.', error(104,
      'Connection reset by peer'))) caused by: ProtocolError(('Connection aborted.',
      error(104, 'Connection reset by peer')))
    """

    i = 0  # stays 0 when the file is empty, so the final report is still valid
    numIndex = 0
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(inputFile, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if i % 100000 == 0:
                print("Processed tweets: ", i)
                print("Indexed tweets: ", numIndex)

            try:
                article_dict = json.loads(line)
                # request_timeout avoids elasticsearch.exceptions.ConnectionTimeout
                # on slow index operations.
                es.index(index=indexName, doc_type='article', id=i, body=article_dict,
                         request_timeout=30)
                numIndex += 1
            except RequestError as e:
                print("Couldn't index article id: ", i)
                # The client's exceptions expose the error string as `.error`;
                # `.message` is not available on Python 3 exceptions.
                print(e.status_code, e.error)
                time.sleep(60)

    print("Processed tweets: ", i)
    print("Indexed tweets: ", numIndex)

In [62]:
# index_from_path(es, "signalmedia-1m.jsonl", "articles")

In [7]:
# Count the documents of type 'article' in the 'articles' index.
es.count(index='articles',doc_type='article')

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'count': 793111}

In [29]:
# Match query on the "content" field, requesting size=10 hits.
res = es.search(
    index="articles",
    doc_type="article",
    size=10,
    body={"query": {"match": {"content": "Alex Wagner"}}},
)

# To look at the stored documents only:
# [x['_source'] for x in res['hits']['hits']]

# Print the first three hits, each followed by an empty line.
for r in res["hits"]["hits"][:3]:
    print(r)
    print()

{'_index': 'articles', '_type': 'article', '_id': '207', '_score': 17.732721, '_source': {'title': 'Your picks for Kanye’s 2020 campaign slogan', 'media-type': 'Blog', 'content': 'After declaring he’s running for the White House – in 2020 that is – we asked for your #Kanye2020Slogan ideas, and you really came through. Alex Wagner shares the best of the best. \n\nRead more', 'source': 'msnbc.com Latest Headlines', 'published': '2015-09-01T02:31:03Z', 'id': '036ae957-779c-4bfd-a7b7-cf9a182baaa3'}}

{'_index': 'articles', '_type': 'article', '_id': '97471', '_score': 17.554064, '_source': {'title': 'Rice rolls over Wagner 56-16', 'media-type': 'News', 'content': "Darik Dillard scores twice, Rice rolls over Wagner   HOUSTON -  \nDarik Dillard ran for two touchdowns and Rice rolled over Wagner 56-16 in the opener for both teams on Saturday in a game that was delayed at the start by foul weather. \n\nDillard finished with 93 yards on 15 carries and Austin Walter carried 12 times for 107 yard

In [45]:
# One value per line: the response's "took" field, the total hit count, the
# maximum score, the number of hits actually returned, and the third hit.
print(
    res["took"],
    res["hits"]["total"],
    res["hits"]["max_score"],
    len(res["hits"]["hits"]),
    res["hits"]["hits"][2],
    sep="\n",
)

8
11411
17.732721
10
{'_index': 'articles', '_type': 'article', '_id': '228457', '_score': 17.034681, '_source': {'title': 'M. Tennis. Green Wave Post Successful Day One at Rice Invitational', 'media-type': 'News', 'content': "Sept. 18, 2015 \n\nRice Invitational Day 1 Results HOUSTON - While Tulane men's tennis senior Dominik Koepfer was advancing to the quarterfinals of the Oracle/ITA Master's Tournament, the rest of the Green Wave squad were making splashes of their own in day one action of the Rice Invitational on Friday. The day was highlighted by the Olive and Blue posting a perfect 7-0 mark in singles action in Draws A-D. In Draw A, Sebastian Rey cruised to a 6-3, 6-1 win over Texas' Nick Naumann and will now face Portland's Mathieu Garcia on Saturday. Constantin Schmitz joined Rey on the winner's side of the bracket after bouncing back with a 1-6, 7-5, 6-2 win over Portland's Mike Pervolaraskis, advancing to face Incarnate Word's Josip Smoljan. \n\nDraw B witnessed both Chi-Sha

In [46]:
# "id" field of each hit's _source, kept in the order the hits were returned.
retrieved_ordered = [hit["_source"]["id"] for hit in res["hits"]["hits"]]

In [47]:
# Zero-based position of this article id within retrieved_ordered;
# list.index raises ValueError when the id is not present.
retrieved_ordered.index("3f78107e-be93-4f7e-b24c-3f6bda9ad7f8")

2