# Load dataframe with EFSA patterns

In [16]:
import os

import pandas as pd
from dsutils.de.files import get_data_path, get_datafile_path

In [17]:
# Path to the service-account JSON key file, resolved through the project's
# get_datafile_path helper.
path_to_json_key = get_datafile_path("vsi-esv-ab828a611479.json")

In [18]:
efsa_path = get_datafile_path("2022-09-06_EFSAmagali_EPPO_ComonNames.xlsx")

# The source is an Excel workbook (.xlsx, a binary format), so it has to be
# parsed with read_excel; opening it in text mode and handing it to read_csv
# cannot decode it.
df = pd.read_excel(efsa_path)

  warn("Workbook contains no default style, apply openpyxl's default")


In [19]:
# List the column names of the loaded EFSA sheet.
df.columns

Index(['...1', 'Category (pest name)', 'Keywords', 'Keywords2',
       'ComonNameEPPO', 'LangueEPPO', 'PreferredNameEPPO'],
      dtype='object')

In [20]:
# Preview the first rows of the EFSA keyword table.
df.head()

Unnamed: 0,...1,Category (pest name),Keywords,Keywords2,ComonNameEPPO,LangueEPPO,PreferredNameEPPO
0,id,AcaloleptaSejuncta-PHT,acalolepta+sejuncta,acalolepta sejuncta,,,
1,id,AcalymmaVittatum-PHT,escarabajo+de+las+cucurbit_ceas,escarabajo de las cucurbit_ceas,,,
2,,AcalymmaVittatum-PHT,acalymma+vittata,acalymma vittata,,,
3,,AcalymmaVittatum-PHT,cistela+melanocephala,cistela melanocephala,,,
4,,AcalymmaVittatum-PHT,crioceris+vittata,crioceris vittata,,,


# Setup search with google cloud

## Setting up Custom search engine

In [21]:
# Following the approach described in:
# https://stackoverflow.com/questions/37083058/programmatically-searching-google-in-python-using-custom-search

In [22]:
# After creating a Programmable Search Engine at
# http://www.google.com/cse/
# these are its identifiers:

cse_id = "014ace1401808423a"
# Derive the engine URL from the id so the two values cannot drift apart.
cse_url = f"https://cse.google.com/cse?cx={cse_id}"

## Loading Google Cloud credentials

In [23]:
from google.oauth2 import service_account
# Build service-account credentials from the JSON key file at path_to_json_key.
# NOTE(review): `credentials` is not referenced by any later cell visible in
# this notebook (the searches authenticate with an API key) — confirm it is
# still needed.
# NOTE(review): the requested scope looks like the Search Console
# ("webmasters") scope rather than one for Custom Search — verify.
credentials = service_account.Credentials.from_service_account_file(
    path_to_json_key,
    scopes = ['https://www.googleapis.com/auth/webmasters'],
)

## Get a google API key

In [24]:
# A google API key is a string that allows you to use google cloud for searching the web.
# Get a google search API key here:
# https://developers.google.com/custom-search/v1/overview

# Scroll down to the "Get a key" button
# select your google project
# Agree to terms and conditions
# Copy the key

## Test search

In [25]:
from googleapiclient.discovery import build
import pprint

# Read the Google Custom Search API key from the environment instead of
# committing it to the notebook. Export GOOGLE_API_KEY before starting Jupyter;
# a missing variable raises KeyError here rather than failing later mid-query.
my_api_key = os.environ["GOOGLE_API_KEY"]
my_cse_id = cse_id

def test_google_search(search_term, api_key, cse_id, **kwargs):
    """Run a single Custom Search query and return its result items.

    Args:
        search_term: Query string sent as ``q``.
        api_key: Google API key used to build the client.
        cse_id: Programmable Search Engine id sent as ``cx``.
        **kwargs: Extra query parameters forwarded to ``cse().list``
            (for example ``num``, ``hl``, ``gl``).

    Returns:
        The list stored under ``'items'`` in the response, or an empty list
        when the response carries no ``'items'`` key.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # A query with zero hits comes back without an 'items' key; treat that as
    # "no results" instead of raising KeyError.
    return res.get('items', [])


# Smoke test: one query against the custom search engine, then inspect the
# shape of what comes back.
results = test_google_search(
    'stackoverflow',
    my_api_key,
    my_cse_id,
    num=10, # this function can return at most 10 google search results!
    #lr = "es", #specify the language
    hl = "es",
    gl = "pe", #specify the location
    )
print(len(results))
print(type(results))
# Plain loops: the previous list comprehensions were used only for their
# printing side effect and built throwaway lists of None.
for result in results:
    print(type(result))
for result in results:
    print(list(result.keys()))
for result in results:
    pprint.pprint(result['link'])
    pprint.pprint(result['snippet'])


HttpError: <HttpError 429 when requesting https://customsearch.googleapis.com/customsearch/v1?q=stackoverflow&cx=014ace1401808423a&num=10&hl=es&gl=pe&key=AIzaSyD-39P07k4yu6qVMe1CegDLmGNRU7pnw8c&alt=json returned "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:1080937495484'.". Details: "[{'message': "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:1080937495484'.", 'domain': 'global', 'reason': 'rateLimitExceeded'}]">

## List languages and countries



In [None]:
# languages supported by google search
# https://developers.google.com/custom-search/docs/json_api_reference#supported-interface-languages

# !!!!
# note that even when you specify a language, the results may contain links to pages in different languages!

In [None]:
# locations supported by google search
# https://developers.google.com/custom-search/docs/json_api_reference#country-codes

In [None]:
# Table of Google Search interface languages: display name and its `hl` code.
# NOTE(review): the preview below lists Amharic with hl_value "sm"; the expected
# code is presumably "am" — verify the CSV against Google's language list.
df_languages = pd.read_csv("google_search_languages.csv")
df_languages.head()

Unnamed: 0,display_language,hl_value
0,Afrikaans,af
1,Albanian,sq
2,Amharic,sm
3,Arabic,ar
4,Azerbaijani,az


In [None]:
# Table of Google Search locations: country name and its country code.
df_locations = pd.read_csv("google_search_locations.csv")
df_locations.head()

Unnamed: 0,country,country_code
0,Afghanistan,af
1,Albania,al
2,Algeria,dz
3,American Samoa,as
4,Andorra,ad


In [None]:
# Code columns that the search loop iterates over (one value per language /
# per location).
google_search_languages = df_languages.loc[:, 'hl_value']
google_search_locations = df_locations.loc[:, 'country_code']

# Search strategy

- Build a dataset of EFSA pattern and google search results. 
- Each row has an EFSA pattern and a link where it appears (according to Google Search)
- To build the dataset, we scrape across all languages and locations, ignoring duplicate links

- TODO after building the dataset:
 - get the plain text from each link
 - convert the EFSA pattern to regex
 - identify the language of the plain text
 - extract the matches in the plain text

# Search EFSA pattern on google


In [None]:
from googleapiclient.discovery import build
import pprint

# Read the Google Custom Search API key from the environment instead of
# committing it to the notebook. Export GOOGLE_API_KEY before starting Jupyter.
my_api_key = os.environ["GOOGLE_API_KEY"]
my_cse_id = cse_id
# Single client object, passed to every search call below.
service = build("customsearch", "v1", developerKey=my_api_key)

def google_search(
    service,
    search_term,
    api_key,
    cse_id,
    **kwargs
    ):
    """Run one Custom Search query on an already-built client.

    Args:
        service: Client object exposing ``cse().list(...).execute()``.
        search_term: Query string sent as ``q``.
        api_key: Accepted for signature compatibility; not used here because
            the key is already bound to ``service``.
        cse_id: Programmable Search Engine id sent as ``cx``.
        **kwargs: Extra query parameters forwarded to ``cse().list``
            (for example ``num``, ``hl``, ``gl``).

    Returns:
        The list stored under ``'items'`` in the response, or an empty list
        when the response carries no ``'items'`` key.
    """
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # A query with zero hits comes back without an 'items' key; treat that as
    # "no results" instead of raising KeyError.
    return res.get('items', [])


def google_search_by_language_and_location(
    service,
    search_term,

):
    """Collect the distinct result links for a term over every language/location pair.

    Issues one query per combination of ``google_search_languages`` (sent as
    ``hl``) and ``google_search_locations`` (sent as ``gl``), so the number of
    API requests is ``len(languages) * len(locations)``.

    Args:
        service: Custom Search client passed through to ``google_search``.
        search_term: Query string to search for.

    Returns:
        A list of unique result links (order not guaranteed). Empty when
        either code list is empty or no query returns results.
    """
    result_links = set()
    for language in google_search_languages:
        for location in google_search_locations:
            search_results = google_search(
                service,
                search_term,
                api_key=my_api_key,
                cse_id=my_cse_id,
                num=10,
                # Actually apply the loop variables; without them every
                # iteration repeats the identical query.
                hl=language,
                gl=location,
            )
            # Accumulate inside the inner loop so no location's results are
            # dropped; the set takes care of duplicate links.
            result_links.update(result['link'] for result in search_results)

    return list(result_links)


In [None]:
%%time
# test
# Smoke test: run the multi-language/location search for the first EFSA keyword.
# NOTE(review): 'Keywords' joins words with '+', and the request URL in the
# error output shows it sent as a literal plus (%2B). 'Keywords2' holds the
# space-separated form — confirm which column should be used as the query.
print(df['Keywords'][0] )
google_search_by_language_and_location(service,  df['Keywords'][0] )

acalolepta+sejuncta


HttpError: <HttpError 429 when requesting https://customsearch.googleapis.com/customsearch/v1?q=acalolepta%2Bsejuncta&cx=014ace1401808423a&num=10&key=AIzaSyD-39P07k4yu6qVMe1CegDLmGNRU7pnw8c&alt=json returned "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:1080937495484'.". Details: "[{'message': "Quota exceeded for quota metric 'Queries' and limit 'Queries per day' of service 'customsearch.googleapis.com' for consumer 'project_number:1080937495484'.", 'domain': 'global', 'reason': 'rateLimitExceeded'}]">

> I exceeded my quota!

> Ask for more from owners of Google Project?