### Notebook to test ElasticSearch on the corpus of articles

In [1]:
# Imports 
import requests, json
import random
import pandas as pd
from tqdm import tqdm
from elasticsearch import Elasticsearch, helpers
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Connect to ElasticSearch
res = requests.get('http://localhost:9200')
if res.content:
    print('[+] Elasticsearch is ready for connection.')
try:
    es = Elasticsearch(hosts='http://localhost', PORT=9200, timeout=60, retry_on_timeout=True)
    print('[+] Elasticsearch successfully connected.')
except:
    print('[-] An error occured during connection to Elasticsearch.')

[+] Elasticsearch is ready for connection.
[+] Elasticsearch successfully connected.


In [33]:
# My Index
INDEX_NAME = 'articles_companies'

In [34]:
# Match all the document
query_all = {
    'query' : {
        'match_all' : { }
    }
}

In [35]:
# Get the number of documents in the index
resp = es.count(index=INDEX_NAME, body=query_all)
print("We have {} document in the index '{}'.".format(resp['count'], INDEX_NAME))

We have 324185 document in the index 'articles_companies'.


In [6]:
# Function to return to query to get 10 most significant word reguarding a company name
def query_significant_terms(company_name):
    query = {
        "size": 0, 
        "query": {
        "bool": {
          "must": [
            {
              "query_string": {
                "query": company_name,
                "fields": ["title", "full-text"]
              }
            }
          ]
        }
      },
      "aggs": {
        "sample": {
          "sampler": {
            "shard_size": 150000
          },
          "aggs": {
            "keywords": {
              "significant_text": {
                "field": "full-text",
                "include": '.*' + company_name.lower() + '.*',
                "size": 10
              }
            }
          }
        }
      } 
    }
    return query

In [62]:
# Test with one company Apple
my_company = 'Amazon'
query = query_significant_terms(my_company)
resp = es.search(index=INDEX_NAME, body=query)

resp
# Problem here, exlcude and inlcude seems to have revese operation : 
#   include works like an exlcude and exclude wroks like an include


{'took': 30731,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7950, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'sample': {'doc_count': 7950,
   'keywords': {'doc_count': 7950,
    'bg_count': 324185,
    'buckets': [{'key': 'amzn.o',
      'doc_count': 1817,
      'score': 5.641238325053093,
      'bg_count': 2885},
     {'key': 'bezos',
      'doc_count': 979,
      'score': 3.970225654945654,
      'bg_count': 1201},
     {'key': 'rossignol',
      'doc_count': 743,
      'score': 3.1953080998381895,
      'bg_count': 861},
     {'key': 'lauwin',
      'doc_count': 584,
      'score': 2.9220559313318306,
      'bg_count': 584},
     {'key': 'planque',
      'doc_count': 584,
      'score': 2.9220559313318306,
      'bg_count': 584},
     {'key': 'warehouses',
      'doc_count': 1021,
      'score': 2.3961235522733912,
      'bg_count': 2118},
     {'key': 'microsoft',
      'doc_co

## Fetching companies names and related names

In [63]:
# Load companies lexical
companies = pd.read_excel (r'./data/comapny_name-related_words.xlsx', header = None, names=['text'])
# Lower
#df['text'] = [str(row).lower() for index, row in df.iterrows()] 
companies[['companies', "words"]] = companies.text.str.split(";", expand=True)
companies.drop(labels=['text'], axis = 1, inplace=True)
companies = companies.groupby('companies')['words'].apply(list).reset_index(name='lexic')

In [64]:
companies.companies[:5]

0          21st Century Fox
1       Activision Blizzard
2                    Adobe 
3    Advanced Micro Devices
4       Akamai Technologies
Name: companies, dtype: object

In [65]:
# Getting significant words for a company
def get_significant_terms(company):
    significant_terms = []
    query = query_significant_terms(company)
    try:
        response = es.search(index=INDEX_NAME, body=query)
        bucket = response['aggregations']['sample']['keywords']['buckets']
        #print('[+] Elasticsearch query successfully sent and received for {}.'.format(company))
    except:
        bucket = []
        print('[-] An error occured during querying Elasticsearch for {}.'.format(company))
    
    if bucket:
        for i in range(len(bucket)):
            term = bucket[i]['key']
            significant_terms.append(term)
            
    return significant_terms

In [49]:
# Create the dictionnary with all the significatn terms
relevant_words = {company:get_significant_terms(company) for company in companies.companies}

[+] Elasticsearch query successfully sent adn received for 21st Century Fox.
[+] Elasticsearch query successfully sent adn received for Activision Blizzard.
[+] Elasticsearch query successfully sent adn received for Adobe .
[+] Elasticsearch query successfully sent adn received for Advanced Micro Devices.
[+] Elasticsearch query successfully sent adn received for Akamai Technologies.
[+] Elasticsearch query successfully sent adn received for Akamai Tecnologies.
[+] Elasticsearch query successfully sent adn received for Alexion Pharmaceuticals.
[+] Elasticsearch query successfully sent adn received for Alphabet.
[+] Elasticsearch query successfully sent adn received for Amazon.
[-] An error occured during querying Elasticsearch for American Airlines Group.
[-] An error occured for American Airlines Group, the list of significant terms is empty.
[+] Elasticsearch query successfully sent adn received for Amgen.
[+] Elasticsearch query successfully sent adn received for Analog Devices.
[+]

In [50]:
relevant_words

{'21st Century Fox': ['fox',
  'century',
  '21st',
  '19th',
  'white',
  'news',
  'trump',
  'house',
  'donald',
  'interview'],
 'Activision Blizzard': ['blizzard',
  'activision',
  'nasdaq:atvi',
  'atvi.o',
  'warcraft',
  'videogame',
  'warzone',
  "blizzard's",
  '4185',
  'kotick'],
 'Adobe ': ['adobe',
  'photoshop',
  'adbe.o',
  'athletica',
  'valueacts',
  '96.08',
  "adobe's",
  '9,588.81',
  '4185',
  '3,041.31'],
 'Advanced Micro Devices': ['advanced',
  'devices',
  'micro',
  'barda',
  'biomedical',
  'technology',
  'device',
  'apple',
  'research',
  'bluetooth'],
 'Akamai Technologies': ['technologies',
  'huawei',
  'uber',
  'huaweis',
  'uber.n',
  'hailing',
  'hwt.ul',
  'technology',
  'entity',
  'wanzhou'],
 'Akamai Tecnologies': ['akamai',
  'akam',
  'llnw',
  'nasdaq:akam',
  'cdn',
  "fastly's",
  'fastly',
  'nyse:fsly'],
 'Alexion Pharmaceuticals': ['pharmaceuticals',
  'regeneron',
  'inovio',
  'sanofi',
  'alexion',
  'kevzara',
  'regn.o',
 

In [51]:
# Output json file 
output_path = './data/relevant_words.json'
with open(output_path, "w") as outfile:  
    json.dump(relevant_words, outfile) 

### French articles

In [12]:
# fetching data
PATH = "./data/ArticleCompany_2020-11-17/"
corpus = "corpus_check_long_SIREN_UPDATED2"
names = "siren_name_map_clean"

In [13]:
#df_name = pd.read_json(PATH + names +".json")
with open(PATH + names +".json") as json_file: 
    dict_names = json.load(json_file) 

with open(PATH + corpus +".json") as json_file: 
    dict_corpus = json.load(json_file) 

In [20]:
len(dict_corpus)

57540

In [21]:
dict_corpus[0]

{'id': '8',
 'siren': '[419838529, 813883964]',
 'corpus': 'Ipsen lorgne les peptides de PeptiMimesis ',
 'url_article': 'http://www.boursier.com/actions/actualites/news/ipsen-lorgne-les-peptides-de-peptimimesis-677907.html'}

In [14]:
def get_company_name(siren):
    try:
        name = dict_names[siren]
    except:
        name = ''
    return name
    
# Append Compay name to di
for corpus in dict_corpus:
    sirens = corpus['siren'][1:-1].split(',')
    sirens = [element.strip() for element in sirens]
    names = [get_company_name(element) for element in sirens] 
    corpus['companies'] = names

In [23]:
dict_corpus[0]

{'id': '8',
 'siren': '[419838529, 813883964]',
 'corpus': 'Ipsen lorgne les peptides de PeptiMimesis ',
 'url_article': 'http://www.boursier.com/actions/actualites/news/ipsen-lorgne-les-peptides-de-peptimimesis-677907.html',
 'companies': ['IPSEN', 'PEPTIMIMESIS']}

### Create Train and Test files

In [15]:
size = len(dict_corpus)
train_size = (int) (0.7 * size)
print(train_size)

train_data = dict_corpus[:train_size]
test_data = dict_corpus[train_size:]

40278


In [16]:
# Output json corpus file 
output_train = './data/french_articles/train_fr_articles.json'
output_test = './data/french_articles/test_fr_articles.json'

with open(output_train, "w") as outfile:  
    for line in train_data:
        outfile.write(json.dumps(line))
        outfile.write('\n')

with open(output_test, "w") as outfile:  
    for line in test_data:
        outfile.write(json.dumps(line))
        outfile.write('\n')

## Dictionnary for french companies

In [41]:
# My index
INDEX_NAME_FR = 'french_articles'

In [40]:
# Listing all the companies
french_companies = list(dict_names.values())
print("We have {} french companies.".format(len(french_companies)))

We have 30178 french companies.


In [42]:
# Get the number of documents in the index
resp = es.count(index=INDEX_NAME_FR, body=query_all)
print("We have {} document in the index '{}'.".format(resp['count'], INDEX_NAME_FR))

We have 57540 document in the index 'french_articles'.


In [8]:
# Function to return to query to get 10 most significant word reguarding a company name
def query_significant_terms_fr(company_name):
    query = {
        "size": 0, 
        "query": {
        "bool": {
          "must": [
            {
              "query_string": {
                "query": company_name,
                "fields": ["companies"]
              }
            }
          ]
        }
      },
      "aggs": {
        "sample": {
          "sampler": {
            "shard_size": 150000
          },
          "aggs": {
            "keywords": {
              "significant_text": {
                "field": "corpus",
                "include": '.*' + company_name.lower() + '.*',
                "size": 20
              }
            }
          }
        }
      } 
    }
    return query

In [19]:
# Getting significant words for a company
def get_significant_terms_fr(company, index):
    significant_terms = list()
    query = query_significant_terms_fr(company)
    try:
        response = es.search(index=index, body=query)
        bucket = response['aggregations']['sample']['keywords']['buckets']
        #print('[+] Elasticsearch query successfully sent and received for {}.'.format(company))
    except:
        bucket = []
        #print('[-] An error occured during querying Elasticsearch for {}.'.format(company))
    
    if bucket:
        for i in range(len(bucket)):
            term = list()
            term.append(bucket[i]['key'])
            term.append(bucket[i]['score'])
            significant_terms.append(term)
            
    return significant_terms

In [None]:
# Create the dictionnary with all the significatn terms
relevant_words_fr = {company:get_significant_terms_fr(company, INDEX_NAME_FR) for company in tqdm(french_companies)}


### Create Dictionnary from training set

In [20]:
INDEX_FR = 'train_fr_articles'

In [21]:
%%time
relevant_words_train = {siren:get_significant_terms_fr(dict_names[siren], INDEX_FR) for siren in dict_names}

CPU times: user 26.7 s, sys: 5.29 s, total: 32 s
Wall time: 10h 20min 47s


In [2]:
relevant_words_train

NameError: name 'relevant_words_train' is not defined

In [22]:
# Output json file 
output_path = './relevant_words/francais/relevant_words_train.json'
with open(output_path, "w") as outfile:  
    json.dump(relevant_words_train, outfile) 

#### Now we label the test set with those labels and evaluate it

In [134]:
def labeling(text, relevant_words):
    labels = []
    for siren in relevant_words:
        if dict_names[siren].lower() in text.lower():
            labels.append(siren)
        else:
            for related_name in relevant_words[siren]:
                if related_name.lower() in text.lower() and siren not in labels:
                    labels.append(siren)
                    break
    return labels

In [135]:
# Test set
output_test = './data/french_articles/test_fr_articles.json'
y_test = list()
y_corpuses = list()
with open(output_test, "r") as outfile:  
    for line in test_data:
        tmp = line['siren'][1:-1].split(',')
        y_test.append([element.strip() for element in tmp])
        y_corpuses.append(line['corpus'])

In [136]:
labels = [labeling(text, relevant_words_train) for text in y_corpuses]


KeyboardInterrupt: 

### Accuracy of the relevant terms

In [None]:
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(predicted)):
        if predicted[i]:
            for label in predicted[i]:
                if label in actual[i]:
                    correct += 1
                    break
    return correct / float(len(actual))

In [None]:
print('The accuracy of the relevant terms list is : ')
print(accuracy_metric(y_test, labels))