### Notebook to test ElasticSearch on the corpus of articles

In [1]:
# Imports 
import requests, json
import random
import pandas as pd
from tqdm import tqdm
from elasticsearch import Elasticsearch, helpers
import warnings
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
# Connect to ElasticSearch
res = requests.get('http://localhost:9200')
if res.content:
    print('[+] Elasticsearch is ready for connection.')
try:
    es = Elasticsearch(hosts='http://localhost', PORT=9200, timeout=60, retry_on_timeout=True)
    print('[+] Elasticsearch successfully connected.')
except:
    print('[-] An error occured during connection to Elasticsearch.')

[+] Elasticsearch is ready for connection.
[+] Elasticsearch successfully connected.


In [33]:
# My Index
INDEX_NAME = 'articles_companies'

In [41]:
# Match all the document
query_all = {
    'query' : {
        'match_all' : { }
    }
}

In [35]:
# Get the number of documents in the index
resp = es.count(index=INDEX_NAME, body=query_all)
print("We have {} document in the index '{}'.".format(resp['count'], INDEX_NAME))

We have 324185 document in the index 'articles_companies'.


In [6]:
# Function to return to query to get 10 most significant word reguarding a company name
def query_significant_terms(company_name):
    query = {
        "size": 0, 
        "query": {
        "bool": {
          "must": [
            {
              "query_string": {
                "query": company_name,
                "fields": ["title", "full-text"]
              }
            }
          ]
        }
      },
      "aggs": {
        "sample": {
          "sampler": {
            "shard_size": 150000
          },
          "aggs": {
            "keywords": {
              "significant_text": {
                "field": "full-text",
                "include": '.*' + company_name.lower() + '.*',
                "size": 10
              }
            }
          }
        }
      } 
    }
    return query

In [62]:
# Test with one company Apple
my_company = 'Amazon'
query = query_significant_terms(my_company)
resp = es.search(index=INDEX_NAME, body=query)

resp
# Problem here, exlcude and inlcude seems to have revese operation : 
#   include works like an exlcude and exclude wroks like an include


{'took': 30731,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7950, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'sample': {'doc_count': 7950,
   'keywords': {'doc_count': 7950,
    'bg_count': 324185,
    'buckets': [{'key': 'amzn.o',
      'doc_count': 1817,
      'score': 5.641238325053093,
      'bg_count': 2885},
     {'key': 'bezos',
      'doc_count': 979,
      'score': 3.970225654945654,
      'bg_count': 1201},
     {'key': 'rossignol',
      'doc_count': 743,
      'score': 3.1953080998381895,
      'bg_count': 861},
     {'key': 'lauwin',
      'doc_count': 584,
      'score': 2.9220559313318306,
      'bg_count': 584},
     {'key': 'planque',
      'doc_count': 584,
      'score': 2.9220559313318306,
      'bg_count': 584},
     {'key': 'warehouses',
      'doc_count': 1021,
      'score': 2.3961235522733912,
      'bg_count': 2118},
     {'key': 'microsoft',
      'doc_co

## Fetching companies names and related names

In [63]:
# Load companies lexical
companies = pd.read_excel (r'./data/comapny_name-related_words.xlsx', header = None, names=['text'])
# Lower
#df['text'] = [str(row).lower() for index, row in df.iterrows()] 
companies[['companies', "words"]] = companies.text.str.split(";", expand=True)
companies.drop(labels=['text'], axis = 1, inplace=True)
companies = companies.groupby('companies')['words'].apply(list).reset_index(name='lexic')

In [64]:
companies.companies[:5]

0          21st Century Fox
1       Activision Blizzard
2                    Adobe 
3    Advanced Micro Devices
4       Akamai Technologies
Name: companies, dtype: object

In [65]:
# Getting significant words for a company
def get_significant_terms(company):
    significant_terms = []
    query = query_significant_terms(company)
    try:
        response = es.search(index=INDEX_NAME, body=query)
        bucket = response['aggregations']['sample']['keywords']['buckets']
        #print('[+] Elasticsearch query successfully sent and received for {}.'.format(company))
    except:
        bucket = []
        print('[-] An error occured during querying Elasticsearch for {}.'.format(company))
    
    if bucket:
        for i in range(len(bucket)):
            term = bucket[i]['key']
            significant_terms.append(term)
            
    return significant_terms

In [49]:
# Create the dictionnary with all the significatn terms
relevant_words = {company:get_significant_terms(company) for company in companies.companies}

[+] Elasticsearch query successfully sent adn received for 21st Century Fox.
[+] Elasticsearch query successfully sent adn received for Activision Blizzard.
[+] Elasticsearch query successfully sent adn received for Adobe .
[+] Elasticsearch query successfully sent adn received for Advanced Micro Devices.
[+] Elasticsearch query successfully sent adn received for Akamai Technologies.
[+] Elasticsearch query successfully sent adn received for Akamai Tecnologies.
[+] Elasticsearch query successfully sent adn received for Alexion Pharmaceuticals.
[+] Elasticsearch query successfully sent adn received for Alphabet.
[+] Elasticsearch query successfully sent adn received for Amazon.
[-] An error occured during querying Elasticsearch for American Airlines Group.
[-] An error occured for American Airlines Group, the list of significant terms is empty.
[+] Elasticsearch query successfully sent adn received for Amgen.
[+] Elasticsearch query successfully sent adn received for Analog Devices.
[+]

In [50]:
relevant_words

{'21st Century Fox': ['fox',
  'century',
  '21st',
  '19th',
  'white',
  'news',
  'trump',
  'house',
  'donald',
  'interview'],
 'Activision Blizzard': ['blizzard',
  'activision',
  'nasdaq:atvi',
  'atvi.o',
  'warcraft',
  'videogame',
  'warzone',
  "blizzard's",
  '4185',
  'kotick'],
 'Adobe ': ['adobe',
  'photoshop',
  'adbe.o',
  'athletica',
  'valueacts',
  '96.08',
  "adobe's",
  '9,588.81',
  '4185',
  '3,041.31'],
 'Advanced Micro Devices': ['advanced',
  'devices',
  'micro',
  'barda',
  'biomedical',
  'technology',
  'device',
  'apple',
  'research',
  'bluetooth'],
 'Akamai Technologies': ['technologies',
  'huawei',
  'uber',
  'huaweis',
  'uber.n',
  'hailing',
  'hwt.ul',
  'technology',
  'entity',
  'wanzhou'],
 'Akamai Tecnologies': ['akamai',
  'akam',
  'llnw',
  'nasdaq:akam',
  'cdn',
  "fastly's",
  'fastly',
  'nyse:fsly'],
 'Alexion Pharmaceuticals': ['pharmaceuticals',
  'regeneron',
  'inovio',
  'sanofi',
  'alexion',
  'kevzara',
  'regn.o',
 

In [51]:
# Output json file 
output_path = './data/relevant_words.json'
with open(output_path, "w") as outfile:  
    json.dump(relevant_words, outfile) 

### French articles

In [11]:
# fetching data
PATH = "./data/ArticleCompany_2020-11-17/"
corpus = "corpus_check_long_SIREN_UPDATED2"
names = "siren_name_map_clean"

In [12]:
#df_name = pd.read_json(PATH + names +".json")
with open(PATH + names +".json") as json_file: 
    dict_names = json.load(json_file) 

with open(PATH + corpus +".json") as json_file: 
    dict_corpus = json.load(json_file) 

In [6]:
len(dict_corpus)

57540

In [9]:
dict_corpus[0]

{'id': '8',
 'siren': '[419838529, 813883964]',
 'corpus': 'Ipsen lorgne les peptides de PeptiMimesis ',
 'url_article': 'http://www.boursier.com/actions/actualites/news/ipsen-lorgne-les-peptides-de-peptimimesis-677907.html'}

In [10]:
def get_company_name(siren):
    try:
        name = dict_names[siren]
    except:
        name = ''
    return name
    
# Append Compay name to di
for corpus in dict_corpus:
    sirens = corpus['siren'][1:-1].split(',')
    sirens = [element.strip() for element in sirens]
    names = [get_company_name(element) for element in sirens] 
    corpus['companies'] = names

In [11]:
dict_corpus[0]

{'id': '8',
 'siren': '[419838529, 813883964]',
 'corpus': 'Ipsen lorgne les peptides de PeptiMimesis ',
 'url_article': 'http://www.boursier.com/actions/actualites/news/ipsen-lorgne-les-peptides-de-peptimimesis-677907.html',
 'companies': ['IPSEN', 'PEPTIMIMESIS']}

### Create Train and Test files

In [7]:
size = len(dict_corpus)
train_size = (int) (0.7 * size)
print(train_size)

train_data = dict_corpus[:train_size]
test_data = dict_corpus[train_size:]

40278


In [12]:
test_data[0]['corpus'].lower()

"largilliere finance conseille chemins d'oceans dans la reprise du concessionnaire de bateaux massif marine , fusacq buzz  groupe chemins d'oceans (c.a. : 24 meur) - dirige par pierre trauchessec - accelere sa croissance en rachetant groupe massif marine (c.a. : 14 meur), un acteur majeur de la plaisance sur la facade atlantique francaise  groupe chemins d'oceans, un leader de la plaisance de la cote mediterraneenne francaise, acquiert la totalite de groupe massif marine. cette nouvelle acquisition intervient moins de deux ans apres celle de technic marine intervenue en novembre 2015. cree en 1980, groupe massif marine est un concessionnaire exclusif de bateaux de plaisance des marques beneteau et four winns du groupe beneteau - 1er constructeur mondial de bateaux de plaisance. avec cinq implantations, il s'est developpe dans des zones strategiques comme la baie de quiberon et la cote d'azur.  fort d'un chiffre d'affaires de pres de 14 meur en 2016, de l'exclusivite de ses contrats de 

In [16]:
# Output json corpus file 
output_train = './data/french_articles/train_fr_articles.json'
output_test = './data/french_articles/test_fr_articles.json'

with open(output_train, "w") as outfile:  
    for line in train_data:
        outfile.write(json.dumps(line))
        outfile.write('\n')

with open(output_test, "w") as outfile:  
    for line in test_data:
        outfile.write(json.dumps(line))
        outfile.write('\n')

## Dictionnary for french companies

In [12]:
# My index
INDEX_NAME_FR = 'clean_fr_articles'

In [18]:
# Listing all the companies
french_companies = list(dict_names.values())
print("We have {} french companies.".format(len(french_companies)))
dict_names

We have 30178 french companies.


{'419838529': 'IPSEN',
 '813883964': 'PEPTIMIMESIS',
 '572060333': 'MERSEN',
 '542104245': 'BANQUE PALATINE',
 '399258755': 'SPIE OPERATIONS',
 '542107651': 'ENGIE',
 '552030967': 'VALEO',
 '392032934': 'MSD VACCINS',
 '328479753': 'IDI',
 '799730759': 'GROUPE WINNCARE',
 '332822485': 'HAULOTTE GROUP',
 '351571757': 'GL EVENTS',
 '481480465': 'SAFT GROUPE SA',
 '317540581': 'TRANSGENE',
 '395030844': 'SANOFI',
 '652014051': 'CARREFOUR',
 '342376332': 'ILIAD',
 '390474898': 'CHARGEURS',
 '310879499': 'BOURBON CORPORATION',
 '702012956': 'ALTRAN TECHNOLOGIES',
 '428783872': 'CREDIXIA',
 '562018002': 'GAUMONT',
 '352045454': "FINANCIERE DE L'ECHIQUIER",
 '420713935': 'ERAAM',
 '811849595': 'IENA VENTURE',
 '480772326': 'REALITES PROMOTION',
 '441378312': 'DAMARTEX',
 '331408336': 'NEURONES',
 '335480877': 'ALTAREA',
 '325520013': 'PSB INDUSTRIES',
 '476980362': 'SOMFY SA',
 '348033473': 'MR BRICOLAGE',
 '393341516': 'AIRBUS DEFENCE AND SPACE SAS',
 '784393530': 'RUBIS',
 '433234325': 'GRO

In [42]:
# Get the number of documents in the index
resp = es.count(index=INDEX_NAME_FR, body=query_all)
print("We have {} document in the index '{}'.".format(resp['count'], INDEX_NAME_FR))

We have 57538 document in the index 'clean_fr_articles'.


In [22]:
# Function to return to query to get 10 most significant word reguarding a company name
def query_significant_terms_fr(company_name):
    query = {
        "size": 0, 
        "query": {
        "bool": {
          "must": [
            {
              "query_string": {
                "query": company_name,
                "fields": ["companies"]
              }
            }
          ]
        }
      },
      "aggs": {
        "sample": {
          "sampler": {
            "shard_size": 150000
          },
          "aggs": {
            "keywords": {
              "significant_text": {
                "field": "corpus",
                "size": 500
              }
            }
          }
        }
      } 
    }
    return query

In [20]:
# Getting significant words for a company
def get_significant_terms_fr(company, index):
    significant_terms = list()
    query = query_significant_terms_fr(company)
    try:
        response = es.search(index=index, body=query)
        bucket = response['aggregations']['sample']['keywords']['buckets']
        #print('[+] Elasticsearch query successfully sent and received for {}.'.format(company))
    except:
        bucket = []
        #print('[-] An error occured during querying Elasticsearch for {}.'.format(company))
    
    if bucket:
        for i in range(len(bucket)):
            if i > 20:
                break
            term = list()
            term.append(bucket[i]['key'])
            term.append(bucket[i]['score'])
            significant_terms.append(term)
            
    return significant_terms

In [23]:
%%time
# Create the dictionnary with all the significatn terms
relevant_words_fr_VF = {siren:get_significant_terms_fr(dict_names[siren], INDEX_NAME_FR) for siren in dict_names}


CPU times: user 38.5 s, sys: 11.3 s, total: 49.8 s
Wall time: 19h 25min 42s


In [24]:
# Output json file 
output_path = './relevant_words/francais/relevant_words_VF.json'
with open(output_path, "w") as outfile:  
    json.dump(relevant_words_fr_VF, outfile) 

In [27]:
input_path = './relevant_words/francais/relevant_words_VF.json'
with open(input_path) as json_file: 
    dict_relevant_words = json.load(json_file) 

In [16]:
# Companies with more than 5 articles
len(companies_count_gt_5)

2084

In [30]:
dict_relevant_words_gt_5 = {key:value for key, value in dict_relevant_words.items() if key in companies_count_gt_5}

In [55]:
import collections
dict_distrib = {}
nb_lt_20 = 0
for key, value in dict_relevant_words_gt_5.items():
    lg = len(value)
    if lg < 6:
        nb_lt_20 += 1
    name = str(lg) + '_words'
    if name not in dict_distrib.keys():
        dict_distrib[name] = 1
    else:
        dict_distrib[name] += 1
    
print("There are {} companies with less than 20 relevant words".format(nb_lt_20))

There are 14 companies with less than 20 relevant words


In [52]:
dict_distrib

{'21_words': 2002,
 '17_words': 5,
 '19_words': 9,
 '0_words': 4,
 '10_words': 2,
 '13_words': 2,
 '15_words': 6,
 '20_words': 7,
 '12_words': 3,
 '6_words': 1,
 '5_words': 2,
 '14_words': 5,
 '8_words': 4,
 '1_words': 2,
 '3_words': 2,
 '11_words': 5,
 '18_words': 2,
 '9_words': 2,
 '16_words': 1,
 '4_words': 2,
 '2_words': 2}

In [34]:
dict_distrib

{'21_words': 19406,
 '0_words': 8842,
 '3_words': 128,
 '4_words': 129,
 '10_words': 90,
 '15_words': 67,
 '19_words': 66,
 '17_words': 57,
 '5_words': 119,
 '2_words': 162,
 '11_words': 98,
 '9_words': 79,
 '1_words': 226,
 '13_words': 73,
 '12_words': 80,
 '6_words': 121,
 '14_words': 71,
 '20_words': 54,
 '7_words': 90,
 '18_words': 74,
 '16_words': 59,
 '8_words': 87}

In [56]:
for key, value in dict_relevant_words_gt_5.items():
    lg = len(value)
    if lg == 2:
        print(key, '-->', value)

421197005 --> [['pariscityvision', 4044.890625], ['directoire', 7.5732134086444]]
380656439 --> [['magazine', 1.9986688290382757], ['media', 0.6376529889901774]]


### Create Dictionnary from training set

In [20]:
INDEX_FR = 'train_fr_articles'

In [23]:
output_path = './relevant_words/francais/relevant_words_train.json'
with open(output_path, "r") as infile:  
    relevant_words_train = json.load(infile)

In [25]:
relevant_words_train['419838529']

[['meek', 513.3928571428571],
 ['somatuline', 365.04761904761904],
 ['onivyde', 205.3571428571428],
 ["d'onivyde", 164.25714285714284],
 ['smecta', 164.25714285714284],
 ['cabometyx', 154.0178571428571],
 ['decapeptyl', 154.0178571428571],
 ['garidel', 154.0178571428571],
 ['merrimack', 136.85714285714283],
 ['rein', 74.58441558441557],
 ['gastro', 65.83928571428571]]

## Compter Nombre d'Entreprises sans Articles

In [13]:
# Get companies labeled at least 5 times
dict_count = dict()
#for company in dict_names.keys(): dict_count[company] = 0
for document in dict_corpus:
    #print (document["siren"])
    sir_list = document["siren"][1:-1].split(", ")
    #print (sir_list)
    for siren in sir_list:
        #print (siren)
        if siren in dict_count.keys():
            dict_count[siren] +=1
        else:
            dict_count[siren] = 1
print ("There are",len(dict_count.keys()),"companies with labels out of the", len(dict_names.keys()), "companies")
print ("there are",len(dict_names.keys())-len(dict_count.keys()),"companies with no articles")
print (round(len(dict_count)/(len(dict_names))*100,2),"% of the companies have articles")

There are 28690 companies with labels out of the 30178 companies
there are 1488 companies with no articles
95.07 % of the companies have articles


In [29]:
## Quels sont les entreprises sans articles?
dict_no_acticle_companies = dict()
for company in dict_names.keys():
    if company not in dict_count.keys():
        dict_no_acticle_companies[company] = dict_names[company] 

#### Etudes du nombre d'articles associer a chaque entreprise 

In [32]:
# On prendre seulement les entreprises avec au moins un articles associer
#sns.set(rc={'figure.figsize':(40,5)})
values = list(dict_count.values())
#sns.displot(values, binwidth=3) #bins=20

number = 5
print(stats.describe(values))
print ("There are",round(values.count(1)/len(values)*100,2), "% articles with one associated article")
under_n = [1 for i in values if i < number]
print ("There are",round(len(under_n)/len(values)*100,2), "% articles with less than",number,"associated article")

DescribeResult(nobs=28690, minmax=(1, 175), mean=2.502195887068665, variance=28.018556298899046, skewness=11.346985559301654, kurtosis=211.55000716780165)
There are 63.58 % articles with one associated article
There are 90.42 % articles with less than 5 associated article


In [60]:
# Liste des entreprises avec plus de 5 articles associés
companies_count_gt_5 = [key for key in dict_count if dict_count[key]>5]
companies_count_gt_7 = [key for key in dict_count if dict_count[key]>7]
dict_count['419838529']

26

#### Now we label the test set with those labels and evaluate it

In [46]:
def labeling(text, relevant_words):
    labels = []
    for siren in relevant_words:
        if dict_names[siren].lower() in text.lower():
            labels.append(siren)
        else:
            for related_name in relevant_words[siren]:
                if related_name[0].lower() in text.lower() and siren not in labels:
                    labels.append(siren)
                    break
    return labels

In [37]:
# Test set
output_test = './data/french_articles/test_fr_articles.json'
y_test = list()
y_corpuses = list()
with open(output_test, "r") as outfile:  
    for line in test_data:
        tmp = line['siren'][1:-1].split(',')
        y_test.append([element.strip() for element in tmp])
        y_corpuses.append(line['corpus'])

In [39]:
words_comp_gt_5 = {key:relevant_words_train[key] for key in relevant_words_train if key in companies_count_gt_5}
words_comp_gt_5

{'419838529': [['meek', 513.3928571428571],
  ['somatuline', 365.04761904761904],
  ['onivyde', 205.3571428571428],
  ["d'onivyde", 164.25714285714284],
  ['smecta', 164.25714285714284],
  ['cabometyx', 154.0178571428571],
  ['decapeptyl', 154.0178571428571],
  ['garidel', 154.0178571428571],
  ['merrimack', 136.85714285714283],
  ['rein', 74.58441558441557],
  ['gastro', 65.83928571428571]],
 '572060333': [['themelin', 393.02734375],
  ['graphite', 279.6216681985294],
  ['ftcap', 235.81640624999997],
  ['idealec', 196.513671875],
  ["d'idealec", 117.90820312499999],
  ['cirprotec', 117.90820312499999],
  ['busbars', 88.40771484375],
  ['lgi', 88.40771484375],
  ['louisville', 58.9072265625],
  ['materials', 30.57342529296875],
  ['electrical', 26.420766469594593],
  ['silicium', 22.192604758522723],
  ['carbure', 22.0316162109375],
  ['sic', 22.0316162109375],
  ['courante', 20.743670886075947]],
 '542104245': [['postale', 20.75891877076981],
  ['uff', 18.35782617545288],
  ["l'uff", 

In [47]:
%%time 
labels = [labeling(text, words_comp_gt_5) for text in y_corpuses]


CPU times: user 28min 16s, sys: 13.5 s, total: 28min 30s
Wall time: 29min 29s


In [54]:
len(labels[5])


665

In [50]:
len(y_test)

17262

### Accuracy of the relevant terms

In [None]:
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(predicted)):
        if predicted[i]:
            for label in predicted[i]:
                if label in actual[i]:
                    correct += 1
                    break
    return correct / float(len(actual))

In [None]:
print('The accuracy of the relevant terms list is : ')
print(accuracy_metric(y_test, labels))

### Articles nettoyés

In [23]:
input_file = './data/corpus_nouns.json'
with open(input_file, "r") as infile:  
    corpus_cleaned = json.load(infile)

In [24]:
for line in corpus_cleaned:
    sentence = ' '.join([element for element in line['corpus']])
    line['corpus'] = sentence

In [26]:
def get_company_name(siren):
    try:
        name = dict_names[siren]
    except:
        name = ''
    return name
    
# Append Compay name to di
for corpus in corpus_cleaned:
    sirens = corpus['siren']
    names = [get_company_name(element) for element in sirens] 
    corpus['companies'] = names

In [27]:
# Output json corpus file 
output_path = './data/fr_cleaned_articles.json'
with open(output_path, "w", newline='\r\n') as outfile:  
    for line in corpus_cleaned:
        outfile.write(json.dumps(line))
        outfile.write('\n')