<a href="https://colab.research.google.com/github/fyedro/ExamplesGRLC/blob/master/notebooks/similarity_organisations_ex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![TBFY-Banner](https://raw.githubusercontent.com/TBFY/general/master/figures/tbfy-banner.png)

# **CERVED Use Case - Given a text in a language, a list of awarded organisations in similar procurement processes will be shown.**

### Extended Functionality (run it only once)

It is required to handle short texts: 


In [0]:
!pip install pytextrank
!pip install langdetect
!pip install pysolr
!python -m spacy download it
!python -m spacy download en
!python -m spacy download es
!python -m spacy download fr
!python -m spacy download pt

import spacy
import pytextrank
import pysolr
import it_core_news_sm
from langdetect import detect

def get_language(text):
  """Detect the language of ``text``.

  Args:
    text: the text whose language should be guessed.

  Returns:
    The language code returned by langdetect's ``detect``, or ``"en"``
    when detection fails.
  """
  try:
    return detect(text)
  except Exception:
    # Best-effort fallback to English; unlike a bare ``except`` this does
    # not swallow KeyboardInterrupt / SystemExit.
    return "en"

def get_keywords(text):
  """Return the text of every phrase PyTextRank extracts from ``text``.

  The spaCy model is chosen from the language detected by ``get_language``.
  """
  # pick the spaCy model from the detected language
  pipeline = spacy.load(get_language(text))
  # register PyTextRank as the last stage of the spaCy pipeline
  textrank = pytextrank.TextRank()
  pipeline.add_pipe(textrank.PipelineComponent, name="textrank", last=True)
  parsed = pipeline(text)
  # keep only the phrase text, in the order the phrases are exposed
  return [phrase.text for phrase in parsed._.phrases]

def get_topics(document,levels):
  """Build an AND-joined ``field:value`` query from a document's topic fields.

  Args:
    document: mapping holding whitespace-separated topic values under
      some of the ``levels`` keys.
    levels: field names to read, in the order they should appear.

  Returns:
    A string such as ``"topics0_t:12 AND topics1_t:3"``; the empty string
    when none of the levels carries a topic.
  """
  topics = []
  for level in levels:
    if level in document:
      # split() with no argument drops empty tokens, so blank values or
      # repeated spaces never produce a dangling "level:" clause
      topics.extend(level + ":" + topic for topic in document[level].split())
  return " AND ".join(topics)


def retrieve_documents_from_long_text(request):
  """Query the Search API for documents related to a text.

  Args:
    request: dict with the keys ``source``, ``size``, ``text`` and ``lang``.

  Returns:
    A list of ``{"id": ..., "name_s": ...}`` dicts, one per returned item;
    an empty list when the service does not answer with HTTP 200.
  """
  print("making the query to the SearchAPI..")
  base_url = 'https://tbfy.librairy.linkeddata.es/search-api'
  json_request = {"source": request['source'], "size": request['size'], "text" : request['text'], "lang": request['lang'] }
  resp = requests.post(base_url+'/items', json=json_request)
  if resp.status_code != 200:
    # Something went wrong: report it and return no documents instead of
    # trying to read items out of an error payload.
    print('POST /items/ {}'.format(resp.status_code))
    return []
  # rename "name" to "name_s" so both retrieval paths expose the same keys
  return [{"id":doc['id'], "name_s":doc['name']} for doc in resp.json()]

def retrieve_documents_from_short_text(filter_params):
  """Retrieve related documents for a short text through a two-step Solr search.

  First the keywords of the text are used to find up to three seed
  documents; then the topic fields of those seeds are used to search for
  the final documents.

  Args:
    filter_params: dict with the keys ``text``, ``source``, ``lang`` and
      ``size``.

  Returns:
    The pysolr search results of the topic query, or an empty list when no
    keywords or no topics are available to build a query from.
  """
  print("additional steps to handle short text as input...")
  # Setup a Solr instance. The timeout is optional.
  solr = pysolr.Solr('http://librairy.linkeddata.es/solr/tbfy', timeout=10)

  # compose a solr query to retrieve the most relevant documents from those keywords
  keywords = get_keywords(filter_params['text'])
  if not keywords:
    # without keywords the query string would be empty
    return []
  # escape backslashes and double quotes so a keyword cannot terminate its
  # own quoted phrase
  escaped = [x.replace('\\', '\\\\').replace('"', '\\"') for x in keywords]
  by_keywords_query = ' OR '.join(['txt_t:"'+x+'"' for x in escaped])

  # search documents by keywords
  print("making the query to the SearchAPI..")
  filter_query = "source_s:"+filter_params['source']
  if filter_params['source'] == "tender":
    filter_query += " AND format_s:kg"
  documents_by_keywords = solr.search(by_keywords_query, fq=filter_query, rows=3)

  topic_levels = ['topics0_t','topics1_t','topics2_t']
  high_filter = []
  medium_filter = []
  for document in documents_by_keywords:
    high = get_topics(document,topic_levels)
    medium = get_topics(document,topic_levels[:2])
    # skip seeds without topics: "()" is not a valid clause
    if high:
      high_filter.append("("+high+")")
    if medium:
      medium_filter.append("("+medium+")")

  clauses = high_filter + medium_filter
  if not clauses:
    # no seed documents (or none with topics): nothing to search for
    return []
  topics_query = " OR ".join(clauses)

  return solr.search(topics_query,fq='lang_s:'+filter_params['lang']+" AND source_s:"+filter_params['source'], rows=filter_params['size'])

def retrieve_documents(filter_params,threshold):
  """Dispatch to the short-text or long-text retrieval by token count.

  The text is tokenised with the spaCy model of its detected language;
  fewer than ``threshold`` tokens selects the short-text path.
  """
  text = filter_params['text']
  tokens = spacy.load(get_language(text))(text)
  size = len(tokens)
  print("num words:",size)
  if size >= threshold:
    return retrieve_documents_from_long_text(filter_params)
  return retrieve_documents_from_short_text(filter_params)


## Demo

[Search API](https://github.com/TBFY/search-API) can be used to search for tenders which are related to a procurement text. The language parameter `lang` follows the [ISO 639-1 Code](https://www.iso.org/iso-639-language-codes.html). The service currently supports the following languages: English (`en`), Spanish (`es`), French (`fr`), Italian (`it`) and Portuguese (`pt`).

You can also choose the number of records you want to work with. By default, the notebook will work with 10 records.

In [0]:
#@title Search Documents
import requests
import pandas as pd
from IPython.display import display, HTML
  
# --- form parameters (Colab #@param annotations must stay on the assignment line) ---
text_input =  "Lappalto ha per oggetto la fornitura di apparecchiature conformi alla normativa ECAC Standard 3 con velocita di movimento del nastro interno di almeno 05 msec.  per la rilevazione automatica di esplosivi EDS nei bagagli da stiva dei passeggeri in transito presso gli Aeroporti Milano Linate e Milano Malpensa.  Rientrano altresi nell'oggetto dell'appalto l'installazione e la manutenzione per un periodo di 10 dieci anni a decorrere dalla data di emissione del certificato di verifica di conformita delle apparecchiature fornite"#@param {type:"string"}

max_number = 5 #@param {type:"slider", min:1, max:10, step:1}

language = "es" #@param ["en", "es", "fr", "it", "pt"] {type:"string"}   

doc_type = "tender" #@param ["tender", "jrc"] {type:"string"}

num_words_threshold = 100 #@param {type:"slider", min:50, max:150, step:1}

# --- search and display ---
# the size is passed on as a string
request = dict(source=doc_type, size=str(max_number), text=text_input, lang=language)

# one table row per retrieved document
df = pd.DataFrame(columns=['id', 'title'])
for i, document in enumerate(retrieve_documents(request,num_words_threshold)):
  df.loc[i] = [document['id'],document['name_s']]

display(HTML(df.to_html(justify='center')))



Once we have the list of related documents, we can directly use their `id` to read the resource in the Knowledge-Graph.

The document identifiers, filtered by source (=tender), in the Search-API are the same as the `id` in the KG-API.

The following procedure extracts the tender list from the obtained list of documents filtering only the completed ones.


In [0]:
#@title Search Tenders

def retrieve_tenders(request, threshold=100):
  """Read from the KG API the tenders related to the request text.

  Args:
    request: dict accepted by ``retrieve_documents``; when its optional
      ``complete`` entry is truthy, only tenders whose ``status`` is
      ``"complete"`` are kept.
    threshold: token count below which the short-text retrieval is used
      (defaults to 100, the value previously hard-coded).

  Returns:
    The list of tender resources (decoded JSON) that could be read.
  """
  documents = retrieve_documents(request,threshold)
  print("found",len(documents),"related tenders ..")
  only_complete = request.get('complete', False)
  tenders = []
  for document in documents:
    tender_id = document['id']
    resp = requests.get('https://tbfy.librairy.linkeddata.es/kg-api/tender/'+tender_id)
    if resp.status_code != 200:
      # skip tenders the KG API cannot serve instead of decoding an error body
      print('GET /tender/ {}'.format(resp.status_code))
      continue
    tender = resp.json()
    # a tender without a status is treated as not complete
    if only_complete and tender.get('status') != "complete":
      continue
    tenders.append(tender)
  return tenders

# --- form parameters (Colab #@param annotations must stay on the assignment line) ---
text_input =  "Lappalto ha per oggetto la fornitura di apparecchiature conformi alla normativa ECAC Standard 3 con velocita di movimento del nastro interno di almeno 05 msec.  per la rilevazione automatica di esplosivi EDS nei bagagli da stiva dei passeggeri in transito presso gli Aeroporti Milano Linate e Milano Malpensa.  Rientrano altresi nell'oggetto dell'appalto l'installazione e la manutenzione per un periodo di 10 dieci anni a decorrere dalla data di emissione del certificato di verifica di conformita delle apparecchiature fornite"#@param {type:"string"}

max_number = 5 #@param {type:"slider", min:1, max:10, step:1}

completed = True #@param {type:"boolean"}

language = "es" #@param ["en", "es", "fr", "it", "pt"] {type:"string"}   

# --- search and display ---
request = dict(source="tender", complete=completed, size=str(max_number), text=text_input, lang=language)

# one table row per retrieved tender
df = pd.DataFrame(columns=['id', 'status', 'name', 'description'])
for i, tender in enumerate(retrieve_tenders(request)):
  df.loc[i] = [tender['id'],tender['status'],tender['title'],tender['description']]

display(HTML(df.to_html(justify='center')))

Once we have the list of similar tenders, we first obtain the list of contracting processes related to each tender, then their related awards and, finally, the list of awardees for these awards.

Notice that some of them are shown as "No named organisation". This means that the name and id of the organisation are not included in our database.

In [0]:
#@title Search Organizations

def get_contracting_process(tender_id):
  """Fetch the contracting processes of a tender from the KG API.

  Args:
    tender_id: identifier of the tender.

  Returns:
    The decoded JSON response, or an empty list when the service does not
    answer with HTTP 200.
  """
  resp = requests.get('https://tbfy.librairy.linkeddata.es/kg-api/tender/'+tender_id+"/contractingProcess")
  if resp.status_code != 200:
    # Something went wrong: report it and hand back "nothing found" so the
    # caller does not iterate over an error payload.
    print('GET /tender/contracting_process {}'.format(resp.status_code))
    return []
  return resp.json()

def get_award(contracting_process_id):
  """Fetch the awards of a contracting process from the KG API.

  Args:
    contracting_process_id: identifier of the contracting process.

  Returns:
    The decoded JSON response, or an empty list when the service does not
    answer with HTTP 200.
  """
  resp = requests.get('https://tbfy.librairy.linkeddata.es/kg-api/contractingProcess/'+contracting_process_id+"/award")
  if resp.status_code != 200:
    # Something went wrong: report it and hand back "nothing found" so the
    # caller does not iterate over an error payload.
    print('GET /contracting_process/award {}'.format(resp.status_code))
    return []
  return resp.json()

def get_supplier(award_id):
  """Fetch the suppliers of an award from the KG API.

  Args:
    award_id: identifier of the award.

  Returns:
    The decoded JSON response, or an empty list when the service does not
    answer with HTTP 200.
  """
  resp = requests.get('https://tbfy.librairy.linkeddata.es/kg-api/award/'+award_id+"/supplier")
  if resp.status_code != 200:
    # Something went wrong: report it (naming the endpoint actually called)
    # and hand back "nothing found" so the caller does not iterate over an
    # error payload.
    print('GET /award/supplier {}'.format(resp.status_code))
    return []
  return resp.json()


def retrieve_organizations(request):
  """Collect the awarded organisations of the tenders related to a request.

  For each tender the chain tender -> contracting process -> award ->
  supplier is followed. A record is emitted at the deepest level that could
  be reached, so a record may lack the ``contract``, ``award`` and/or
  ``organization`` keys.

  Args:
    request: dict accepted by ``retrieve_tenders``.

  Returns:
    A list of dicts with the key ``tender`` and, when available,
    ``contract``, ``award`` and ``organization``.
  """
  organizations = []
  for tender in retrieve_tenders(request):
    contracts = get_contracting_process(tender['id'])
    if not contracts:
      organizations.append({'tender': tender['title']})
      continue
    for contracting_process in contracts:
      awards = get_award(contracting_process['id'])
      contract_record = {
        'tender': tender['title'],
        'contract': contracting_process['releasePublishedDate'],
      }
      if not awards:
        organizations.append(contract_record)
        continue
      for award in awards:
        suppliers = get_supplier(award['id'])
        award_record = dict(contract_record, award=award['title'])
        if not suppliers:
          organizations.append(award_record)
          continue
        for supplier in suppliers:
          organization = dict(award_record)
          try:
            organization['organization'] = supplier['legalName']
          except KeyError:
            # no legal name on record: fall back to the supplier id
            organization['organization'] = supplier['id']
          organizations.append(organization)
    # reported only for tenders that have at least one contracting process;
    # the number is the running total of records collected so far
    print("contracts and awards successfully reviewed for tender",len(organizations))
  return organizations



# --- form parameters (Colab #@param annotations must stay on the assignment line) ---
text_input =  "Lappalto ha per oggetto la fornitura di apparecchiature conformi alla normativa ECAC Standard 3 con velocita di movimento del nastro interno di almeno 05 msec.  per la rilevazione automatica di esplosivi EDS nei bagagli da stiva dei passeggeri in transito presso gli Aeroporti Milano Linate e Milano Malpensa.  Rientrano altresi nell'oggetto dell'appalto l'installazione e la manutenzione per un periodo di 10 dieci anni a decorrere dalla data di emissione del certificato di verifica di conformita delle apparecchiature fornite"#@param {type:"string"}

max_number = 10 #@param {type:"slider", min:1, max:10, step:1}

language = "es" #@param ["en", "es", "fr", "it", "pt"] {type:"string"}   

# --- search and display ---
request = {
  "source": "tender",
  "complete": True,
  "size": str(max_number),
  "text": text_input,
  "lang": language,
}

labels = ['organization', 'award', 'contract','tender']
df = pd.DataFrame(columns=labels)
for i, organization in enumerate(retrieve_organizations(request)):
  # fields a record does not carry are shown as 'unknown'
  df.loc[i] = [organization.get(label, 'unknown') for label in labels]

display(HTML(df.to_html(justify='center')))
