### Notebook to test ElasticSearch on the corpus of articles

In [1]:
# Imports 
import requests, json
import pandas as pd
from tqdm import tqdm
from elasticsearch import Elasticsearch, helpers
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Connect to ElasticSearch
res = requests.get('http://localhost:9200')
if res.content:
    print('[+] Elasticsearch is ready for connection.')
try:
    es = Elasticsearch(hosts='http://localhost', PORT=9200, timeout=60, retry_on_timeout=True)
    print('[+] Elasticsearch successfully connected.')
except:
    print('[-] An error occured during connection to Elasticsearch.')

[+] Elasticsearch is ready for connection.
[+] Elasticsearch successfully connected.


In [4]:
# My Index
INDEX_NAME = 'articles_companies'

In [5]:
# Match all the document
query_all = {
    'query' : {
        'match_all' : { }
    }
}

In [6]:
# Get the number of documents in the index
resp = es.count(index=INDEX_NAME, body=query_all)
print("We have {} document in the index '{}'.".format(resp['count'], INDEX_NAME))

We have 324185 document in the index 'articles_companies'.


In [52]:
# Function to return to query to get 10 most significant word reguarding a company name
def query_significant_terms(company_name):
    query = {
        "size": 0, 
        "query": {
        "bool": {
          "must": [
            {
              "query_string": {
                "query": company_name,
                "fields": ["title", "full-text"]
              }
            }
          ]
        }
      },
      "aggs": {
        "sample": {
          "sampler": {
            "shard_size": 150000
          },
          "aggs": {
            "keywords": {
              "significant_text": {
                "field": "full-text",
                "include": '.*' + company_name.lower() + '.*',
                "size": 10
              }
            }
          }
        }
      } 
    }
    return query

In [53]:
# Test with one company Apple
my_company = 'Amazon'
query = query_significant_terms(my_company)
resp = es.search(index=INDEX_NAME, body=query)

resp
# Problem here, exlcude and inlcude seems to have revese operation : 
#   include works like an exlcude and exclude wroks like an include


{'took': 30704,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7950, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'sample': {'doc_count': 7950,
   'keywords': {'doc_count': 7950,
    'bg_count': 324185,
    'buckets': [{'key': 'amzn.o',
      'doc_count': 1817,
      'score': 5.641238325053093,
      'bg_count': 2885},
     {'key': 'bezos',
      'doc_count': 979,
      'score': 3.970225654945654,
      'bg_count': 1201},
     {'key': 'rossignol',
      'doc_count': 743,
      'score': 3.1953080998381895,
      'bg_count': 861},
     {'key': 'lauwin',
      'doc_count': 584,
      'score': 2.9220559313318306,
      'bg_count': 584},
     {'key': 'planque',
      'doc_count': 584,
      'score': 2.9220559313318306,
      'bg_count': 584},
     {'key': 'warehouses',
      'doc_count': 1021,
      'score': 2.3961235522733912,
      'bg_count': 2118},
     {'key': 'microsoft',
      'doc_co

## Fetching companies names and related names

In [32]:
# Load companies lexical
companies = pd.read_excel (r'./data/comapny_name-related_words.xlsx', header = None, names=['text'])
# Lower
#df['text'] = [str(row).lower() for index, row in df.iterrows()] 
companies[['companies', "words"]] = companies.text.str.split(";", expand=True)
companies.drop(labels=['text'], axis = 1, inplace=True)
companies = companies.groupby('companies')['words'].apply(list).reset_index(name='lexic')

In [36]:
companies.companies[:5]

0          21st Century Fox
1       Activision Blizzard
2                    Adobe 
3    Advanced Micro Devices
4       Akamai Technologies
Name: companies, dtype: object

In [48]:
# Getting significant words for a company
def get_significant_terms(company):
    significant_terms = []
    query = query_significant_terms(company)
    try:
        response = es.search(index=INDEX_NAME, body=query)
        bucket = response['aggregations']['sample']['keywords']['buckets']
        print('[+] Elasticsearch query successfully sent adn received for {}.'.format(company))
    except:
        bucket = []
        print('[-] An error occured during querying Elasticsearch for {}.'.format(company))

    #bucket = response['aggregations']['sample']['keywords']['buckets']
    
    if not bucket:
        print('[-] An error occured for {}, the list of significant terms is empty.'.format(company))
    else:
        for i in range(len(bucket)):
            term = bucket[i]['key']
            significant_terms.append(term)
            
    return significant_terms

In [49]:
# Create the dictionnary with all the significatn terms
relevant_words = {company:get_significant_terms(company) for company in companies.companies}

[+] Elasticsearch query successfully sent adn received for 21st Century Fox.
[+] Elasticsearch query successfully sent adn received for Activision Blizzard.
[+] Elasticsearch query successfully sent adn received for Adobe .
[+] Elasticsearch query successfully sent adn received for Advanced Micro Devices.
[+] Elasticsearch query successfully sent adn received for Akamai Technologies.
[+] Elasticsearch query successfully sent adn received for Akamai Tecnologies.
[+] Elasticsearch query successfully sent adn received for Alexion Pharmaceuticals.
[+] Elasticsearch query successfully sent adn received for Alphabet.
[+] Elasticsearch query successfully sent adn received for Amazon.
[-] An error occured during querying Elasticsearch for American Airlines Group.
[-] An error occured for American Airlines Group, the list of significant terms is empty.
[+] Elasticsearch query successfully sent adn received for Amgen.
[+] Elasticsearch query successfully sent adn received for Analog Devices.
[+]

In [50]:
relevant_words

{'21st Century Fox': ['fox',
  'century',
  '21st',
  '19th',
  'white',
  'news',
  'trump',
  'house',
  'donald',
  'interview'],
 'Activision Blizzard': ['blizzard',
  'activision',
  'nasdaq:atvi',
  'atvi.o',
  'warcraft',
  'videogame',
  'warzone',
  "blizzard's",
  '4185',
  'kotick'],
 'Adobe ': ['adobe',
  'photoshop',
  'adbe.o',
  'athletica',
  'valueacts',
  '96.08',
  "adobe's",
  '9,588.81',
  '4185',
  '3,041.31'],
 'Advanced Micro Devices': ['advanced',
  'devices',
  'micro',
  'barda',
  'biomedical',
  'technology',
  'device',
  'apple',
  'research',
  'bluetooth'],
 'Akamai Technologies': ['technologies',
  'huawei',
  'uber',
  'huaweis',
  'uber.n',
  'hailing',
  'hwt.ul',
  'technology',
  'entity',
  'wanzhou'],
 'Akamai Tecnologies': ['akamai',
  'akam',
  'llnw',
  'nasdaq:akam',
  'cdn',
  "fastly's",
  'fastly',
  'nyse:fsly'],
 'Alexion Pharmaceuticals': ['pharmaceuticals',
  'regeneron',
  'inovio',
  'sanofi',
  'alexion',
  'kevzara',
  'regn.o',
 

In [51]:
# Output json file 
output_path = './data/relevant_words.json'
with open(output_path, "w") as outfile:  
    json.dump(relevant_words, outfile) 

In [17]:
# Read the json file line by line and append to the dict
import re
raw_json_data = []
#ArticleCompany_2020-11-17/corpus_check_long_SIREN_UPDATED2
with open('./data/extract.json', 'r+') as f:
    for line in f:
        current = re.sub("},", "}\n", line)
        f.write(line)

In [10]:
raw_json_data

['\t{\n',
 '        "id": "8",\n',
 '        "siren": "[419838529, 813883964]",\n',
 '        "corpus": "Ipsen lorgne les peptides de PeptiMimesis ",\n',
 '        "url_article": "http://www.boursier.com/actions/actualites/news/ipsen-lorgne-les-peptides-de-peptimimesis-677907.html"\n',
 '    },\n',
 '    {\n',
 '        "id": "2894",\n',
 '        "siren": "[419838529]",\n',
 '        "corpus": "Ipsen : accord important avec Probi   publie le 26/04/2016 a 08h51  [FR:FR0010259150]Ipsen[:FR] et Probi ont signe un accord de licence et d\'approvisionnement pour la commercialisation de la souche probiotique Lactobacillus plantarum 299v (LP299V) de Probi. Cet accord couvre 18 pays, principalement en Europe et dans les pays emergents. Ce probiotique, cliniquement documente et couvert par des brevets dans le domaine gastro-intestinal, a vocation a completer le solide portefeuille de gastroenterologie d\'Ipsen. Du point de vue de Probi, cet accord de distribution pourrait etre l\'un des plus im