# Create and Index Signal dataset

In [2]:
import json
import os
import sys
import time

from elasticsearch import Elasticsearch, RequestError

In [3]:
# Client for the Elasticsearch node at http://localhost/ on port 9200.
# `timeout=30` is the client-wide request timeout (seconds, per elasticsearch-py).
es = Elasticsearch(['http://localhost/'], 
                    # http_auth=(<user>, <password>),  # enable if the cluster requires auth;
                    #                                  # read credentials from the environment, never hardcode them
                    port=9200,
                    timeout=30
                    )
# Bare expression: displays the client's repr as the cell output.
es

<Elasticsearch([{'host': 'localhost'}])>

### Very important
https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html

The default similarity (ranking function) in Elasticsearch is BM25.

### Create and delete index

In [3]:
# es.indices.create("articles", ignore=400)

In [4]:
# es.indices.delete(index='articles', ignore=[400, 404])

In [4]:
# Number of documents currently stored in the 'rcv1' index.
es.count(index='rcv1')

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'count': 804414}

In [9]:
def index_from_path(es, inputFile, indexName):
    """
    Index every JSON line of a file as one document in Elasticsearch.

    The file is read line by line. Each non-blank line is parsed with
    ``json.loads`` and sent to ``es.index`` with ``doc_type='article'``, using
    the 1-based line number as the document id. Progress is printed every
    100000 lines and a final summary is printed once the file is exhausted.

    :param es: the ES instance (any object exposing a compatible ``index`` method)
    :param inputFile: path to a UTF-8 text file holding one JSON document per line
    :param indexName: name of the index the documents are written to
    :return: None

    Only ``RequestError`` is caught here: the failing document is reported,
    the function sleeps for 60 seconds and then carries on with the next line.
    Every other exception (including the connection errors listed below and
    JSON parse errors on malformed lines) propagates to the caller.

    Possible errors:
    - elasticsearch.exceptions.RequestError: TransportError(400, u'mapper_parsing_exception',
    u'failed to parse [bounding_box]')

    - elasticsearch.exceptions.ConnectionTimeout: ConnectionTimeout caused by -
    ReadTimeoutError(HTTPConnectionPool(host=u'localhost', port=9200): Read timed out. (read timeout=10))

    - elasticsearch.exceptions.ConnectionError: ConnectionError(('Connection aborted.', error(104,
    'Connection reset by peer'))) caused by: ProtocolError(('Connection aborted.',
    error(104, 'Connection reset by peer')))
    """

    i = 0
    numIndex = 0
    # JSON interchange files are UTF-8; do not depend on the platform default encoding.
    with open(inputFile, "r", encoding="utf-8") as f:
        for line in f:
            i += 1
            if i % 100000 == 0:
                print("Processed tweets: ", i)
                print("Indexed tweets: ", numIndex)

            # A blank (e.g. trailing) line is not a document; skipping it avoids
            # an uncaught JSON parse error. The line still counts towards `i`,
            # so document ids keep matching line numbers.
            if not line.strip():
                continue

            try:
                article_dict = json.loads(line)
                # added request_timeout to avoid elasticsearch.exceptions.ConnectionTimeout
                es.index(index=indexName, doc_type='article', id=i, body=article_dict,
                         request_timeout=30)
                numIndex += 1
            except RequestError as e:
                print("Couldn't index article id: ", i)
                # elasticsearch-py exposes `status_code`, `error` and `info`;
                # there is no `message` attribute on Python 3.
                print(e.status_code, e.error)
                time.sleep(60)

    print("Processed tweets: ", i)
    print("Indexed tweets: ", numIndex)

In [62]:
# index_from_path(es, "signalmedia-1m.jsonl", "articles")

In [10]:
# Number of documents in the 'rcv1' index, restricted to doc_type 'article'.
es.count(index='rcv1',doc_type='article')

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'count': 1400}

In [6]:
# Run a "match" query for the term "stock" against the `content` field of
# 'article' documents in the 'rcv1' index, asking for at most 10 hits.
res = es.search(
    index="rcv1",
    doc_type='article',
    size=10,
    body={"query": {"match": {"content": "stock"}}},
)

# Print the first three raw hits, each followed by an empty line.
for r in res['hits']['hits'][:3]:
    print(r)
    print()

# `_source.id` of the first five hits, in the order the search returned them.
retrieved_ordered = [hit["_source"]["id"] for hit in res['hits']['hits'][:5]]
print(type(retrieved_ordered[0]))

{'_index': 'rcv1', '_type': 'article', '_id': '35085', '_score': 4.2212243, '_source': {'id': '38127', 'date': '1996-09-06', 'lang': 'en', 'title': 'USA: Greif Bros Q3 Class A, B shares fall.', 'headline': 'Greif Bros Q3 Class A, B shares fall.', 'content': 'NINE MONTHS ENDED JULY 31,\n\t\t\t\t\t\t\t1996\t    1995\n Net Sales\t\t\t\t$474,949,000   $539,086,000\n Net Income\t\t\t\t 27,041,000     47,847,000\nAverage Shares:\n   Class A Common Stock\t\t 10,873,172     10,873,172\n   Class B Common Stock\t\t 12,028,460     13,268,883\nNet Income Per Share:\n   Class A Common Stock\t\t\t 1.00\t     1.89\n   Class B Common Stock\t\t\t 1.15\t     2.05\n\t\t\t\t\t   THREE MONTHS ENDED JULY 31,\n\t\t\t\t\t\t\t1996\t     1995\n Net Sales\t\t\t\t  $155,994,000 $184,159,000\n Net Income\t\t\t\t    9,636,000   17,588,000\nAverage Shares:\n   Class A Common Stock\t\t   10,873,172   10,873,172\n   Class B Common Stock\t\t   12,001,793   13,215,106\nNet Income Per Share:\n   Class A Common Stock\t\t\

In [13]:
# Inspect the search response held in `res`.
# `took`: search time as reported by Elasticsearch (milliseconds, per the ES search API).
print(res["took"]) 
# Total number of matching documents reported by the search.
print(res["hits"]["total"])
# Highest relevance score reported for the matching documents.
print(res["hits"]["max_score"])
# Number of hits actually returned in this response.
print(len(res['hits']['hits']))
# Third returned hit (index 2); raises IndexError if fewer than three hits came back.
print(res['hits']['hits'][2])

9
196
4.3328924
10
{'_index': 'rcv1', '_type': 'article', '_id': '670', '_score': 4.054195, '_source': {'id': '3031', 'date': '1996-08-20', 'lang': 'en', 'title': 'USA: Paracelsus, Champion complete merger.', 'headline': 'Paracelsus, Champion complete merger.', 'content': 'Paracelsus Healthcare Corp and Champion Healthcare Corp said on Tuesday in a joint statement that they completed a merger on August 16, 1996, that makes Champion a subsidiary of Paracelsus.\nEach Champion share was converted into one Paracelsus common share and each share of Champion preferred stock was converted into two shares of Paracelsus.\nParacelsus issued 19.8 million shares of common stock in the merger, bringing the total shares outstanding to 54.7 million including 5.2 million from a recent public offering, the companies said.\nPark Hospital GmbH, a German corporation wholly owned by chairman Dr. Manfred George Krukemeyer, owns 29,771,742 shares of Paracelsus common stock, the company said.\nConcurrently wi

In [14]:
# `_source.id` of every returned hit, in the order the search returned them.
retrieved_ordered = [hit["_source"]["id"] for hit in res['hits']['hits']]