In [1]:
# Search terms of the "terrorism" domain.
# NOTE(review): some entries end with a trailing space (e.g. "Euskadi ta Askatasuna ");
# the strings are used verbatim as search terms -- confirm this is intended.
terrorism_search_queries = ["Al-Qaeda",
                          "terrorism",
                          "terror",
                          "attack",
                          "iraq",
                          "afghanistan",
                          "iran",
                          "Pakistan",
                          "Agro",
                          "Environmental Terrorism",
                          "Eco-Terrorism",
                          "Conventional Weapon",
                          "Weapons Grade",
                          "Dirty Bomb",
                          "Nuclear Enrichment",
                          "Nuclear",
                          "Chemical Weapon",
                          "Biological Weapon",
                          "Ammonium nitrate",
                          "Improvised Explosive Device",
                          "Abu Sayyaf",
                          "Hamas",
                          "FARC",
                          "Irish Republican Army",
                          "Euskadi ta Askatasuna ",
                          "Hezbollah",
                          "Tamil Tigers",
                          "PLO",
                          "Palestine Liberation Front",
                          "Car bomb",
                          "Jihad",
                          "Taliban",
                          "Suicide bomber",
                          "Suicide attack",
                          "AL Qaeda in the Arabian Peninsula",
                          "Al Qaeda in the Islamic Maghreb",
                          "Tehrik-i-Taliban Pakistan",
                          "Yemen",
                          "Pirates",
                          "Extremism",
                          "Somalia",
                          "Nigeria",
                          "Political radicalism",
                          "Al-Shabaab",
                          "Nationalism",
                          "Recruitment",
                          "Fundamentalism",
                          "Islamist"]

# The 30 search terms of the "top-30 terrorism" domain.
# Spelling and capitalisation are independent of terrorism_search_queries
# (e.g. "Al Qaeda" here vs "Al-Qaeda" there).
top_30_terrorism_search_queries = ["Al Qaeda",
                          "Terrorism",
                          "Terror",
                          "Environmental Terrorism",
                          "Eco-terrorism",
                          "Conventional Weapon",
                          "Weapons Grade",
                          "Dirty Bomb",
                          "Nuclear Enrichment",
                          "Nuclear",
                          "Chemical Weapon",
                          "Biological Weapon",
                          "Improvised Explosive Device",
                          "Abu Sayyaf",
                          "FARC",
                          "Euskadi ta Askatasuna ",
                          "Hezbollah",
                          "Palestine Liberation Front",
                          "Car bomb",
                          "Jihad",
                          "Taliban",
                          "Suicide bomber",
                          "Suicide attack",
                          "AL Qaeda in the Arabian Peninsula",
                          "Al Qaeda in the Islamic Maghreb",
                          "Tehrik-i-Taliban Pakistan",
                          "Political radicalism",
                          "Al-Shabaab",
                          "Recruitment",
                          "Islamist"]

# Search terms of the "domestic" domain (mostly names of US agencies and security programmes).
# NOTE(review): several entries end with a trailing space (e.g. "National Guard ");
# the strings are used verbatim as search terms -- confirm this is intended.
domestic_search_queries = ["Department of Homeland Security",
                          "Federal Emergency Management Agency",
                          "Coast Guard",
                          "Customs and Border Protection ",
                          "Border patrol",
                          "Secret Service",
                          "Bureau of Land Management",
                          "Homeland defense",
                          "Espionage",
                          "Task Force 88",
                          "Central Intelligence Agency",
                          "Fusion center",
                          "DEA",
                          "Secure Border Initiative ",
                          "Federal Bureau of Investigation", # TODO: replace with the acronym FBI?
                          "Alcohol and Tobacco Tax and Trade Bureau", # TODO: same here, there may be an acronym
                          "U.S. Citizenship and Immigration Services",
                          "Federal Air Marshal Service ",
                          "Transportation Security Administration",
                          "Air Marshal",
                          "Federal Aviation Administration",
                          "National Guard ",
                          "Emergency Management", # TODO: split into separate terms?
                          "U.S. Immigration and Customs Enforcement",
                          "United Nations"]

# Search Terms

In this notebook we look up the Freebase identifier (the Google Knowledge Graph node) for each of our original search queries.

In [2]:
import requests
import pandas as pd
import numpy as np

In [3]:
# Directory to save/read data
data_path = "data/"

# Read API_key (you need your own key for it to work).
# The key is stripped because key files usually end with a newline, which
# would otherwise be sent to the API as part of the "key" parameter.
with open(data_path+"API_key","r") as f:
    API_key = f.read().strip()

In [4]:
# Endpoint of the Google Knowledge Graph Search API (entities:search)
base_request_prefix = "https://kgsearch.googleapis.com/v1/entities:search"

# Example of a complete request URL (API_KEY stands for your own key):
# https://kgsearch.googleapis.com/v1/entities:search?query=taylor+swift&key=API_KEY&limit=1&indent=True

### Create simple request to test functionality

In [5]:

# Single hand-written request to check that the API key and the endpoint work
params = {
    "query" : "iraq", # (alternative test query: FBI)
    "key"   : API_key,
    "limit" : 10,
    "indent": True
}
r = requests.get(base_request_prefix, params = params)
# Fail loudly on an HTTP error (e.g. invalid key) instead of parsing an error payload
r.raise_for_status()
entity = r.json()
# r.url embeds the secret API key, so it must not end up in the saved notebook output.
# Print the same URL rebuilt with a placeholder in place of the key.
redacted_params = {**params, "key": "API_KEY"}
print(requests.Request("GET", base_request_prefix, params = redacted_params).prepare().url)

https://kgsearch.googleapis.com/v1/entities:search?query=iraq&key=API_KEY&limit=10&indent=True


In [6]:
# Display the raw JSON response of the test request
entity

{'@context': {'resultScore': 'goog:resultScore',
  '@vocab': 'http://schema.org/',
  'kg': 'http://g.co/kg',
  'goog': 'http://schema.googleapis.com/',
  'detailedDescription': 'goog:detailedDescription',
  'EntitySearchResult': 'goog:EntitySearchResult'},
 '@type': 'ItemList',
 'itemListElement': [{'result': {'@type': ['DefenceEstablishment',
     'AdministrativeArea',
     'Thing',
     'Place',
     'Country'],
    'description': 'Country in the Middle East',
    'detailedDescription': {'articleBody': 'Iraq, officially the Republic of Iraq, is a country in Western Asia, bordered by Turkey to the north, Iran to the east, Kuwait to the southeast, Saudi Arabia to the south, Jordan to the southwest and Syria to the west. ',
     'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
     'url': 'https://en.wikipedia.org/wiki/Iraq'},
    'name': 'Iraq',
    '@id': 'kg:/m/0d05q4'},
   'resultScore': 14626.021484375,
   '@

### Result explanation

The response JSON contains at most 10 matching entities (the `limit` we requested). They are ordered by `resultScore` ("An indicator of how well the entity matched the request constraints"). We therefore take the first entity, which has the highest `resultScore`, and read its `@id` ("The canonical URI for the entity"). The identifier itself is then obtained by splitting `@id` on ":" and taking the second element.

Source: https://developers.google.com/knowledge-graph/reference/rest/v1


In [7]:
# "@id" of the first result looks like "kg:/m/0d05q4"; keep only the part after the "kg:" prefix
print(entity["itemListElement"][0]["result"]['@id'].split(":")[1])

/m/0d05q4


### Create mapping from search query to Google Knowledge Graph Search Node

In [8]:
def create_search_terms_to_GKG_node_df(search_terms, domain_name, API_key):
    """Creates a dataframe mapping a list of search terms to their GKG node representation.

    We make the assumption that the first result returned by the Google Knowledge
    Graph Search API (the one with the highest resultScore) corresponds to the
    correct topic representation.
    This assumption should hold for our project as the terms are not ambiguous.
    The returned dataframe has attributes:
    {search_term, node_equivalent, domain_name}

    Args:
        search_terms (iterable[str]): All required search queries to map. Any
            iterable is accepted; it is materialised into a list once.
        domain_name (str): domain of the search queries (i.e. terrorism, domestic, top-30 terrorism)
        API_key (str): API key to use for the requests

    Returns:
        dataframe: dataframe mapping search term to their GKG node representation

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status
            (for instance because of an invalid key).
        IndexError: if the API returns no entity for one of the search terms.
    """

    # Materialise once: the terms are iterated for the requests AND reused as a
    # column, which would silently break with a one-shot iterator.
    search_terms = list(search_terms)

    # Create function which uses the GKG Api to find the corresponding node for one search query
    def map_search_term(search_term, API_key):
        """Maps one search query to the node id of its first (best scored) GKG result"""

        # Create base request url
        base_request_prefix = "https://kgsearch.googleapis.com/v1/entities:search"

        # Create parameters for the request
        params = {
            "query" : search_term,
            "key"   : API_key,
            "limit" : 10, # only the first (top-ranked) result is used below
            "indent": True
        }
        # A timeout prevents the notebook from hanging forever on a stalled connection
        r = requests.get(base_request_prefix, params = params, timeout = 30)
        # Surface HTTP errors directly instead of failing later on a missing key
        r.raise_for_status()

        results = r.json().get("itemListElement", [])
        if not results:
            # Name the offending term: a bare "list index out of range" is hard to act on
            raise IndexError(
                "No Knowledge Graph entity found for search term {!r}".format(search_term)
            )

        # "@id" looks like "kg:/m/0d05q4": drop the "kg:" prefix
        # (maxsplit=1 keeps the whole identifier even if it contained a colon)
        return results[0]["result"]["@id"].split(":", 1)[1]

    # Map search terms to node equivalent
    node_equivalents = [map_search_term(search_term, API_key) for search_term in search_terms]

    # Create domain column
    domain_column = [domain_name]*len(node_equivalents)

    # Create dictionary for the df
    data={
        "search_term" : search_terms,
        "node_equivalent" : node_equivalents,
        "domain_name" : domain_column
    }

    # return the mapping df
    return pd.DataFrame(data=data)
    
    

In [9]:
def clean_non_firebase_nodes_ids(dataframe):
    """Keep only the rows whose node id is a Freebase id (starting with "/m/").

    Rows with any other id are dropped, e.g. the Google Knowledge Graph ids
    starting with "/g/", which cannot be used for Google Trends requests.
    Missing or non-string ids are dropped as well instead of raising.
    The index labels of the kept rows are left unchanged.

    Args:
        dataframe (dataframe): pandas dataframe to clean, with a "node_equivalent" column

    Returns:
        dataframe: the rows of `dataframe` whose "node_equivalent" starts with "/m/"
    """
    # astype(str) makes the .str accessor usable whatever the column dtype is
    # (empty column, missing values, ...); such values never start with "/m/".
    is_freebase_id = dataframe["node_equivalent"].astype(str).str.startswith("/m/", na=False)
    return dataframe[is_freebase_id]

### Terrorism dataset Mapping

In [10]:
# Map every terrorism search term to its Knowledge Graph node id
terrorism_mapping_df = create_search_terms_to_GKG_node_df(terrorism_search_queries, "terrorism", API_key)
# Ids starting with "/g/" are Google Knowledge Graph ids; ids starting with "/m/" are Freebase ids
terrorism_mapping_df

Unnamed: 0,search_term,node_equivalent,domain_name
0,Al-Qaeda,/m/0v74,terrorism
1,terrorism,/m/07jq_,terrorism
2,terror,/m/07jq_,terrorism
3,attack,/m/0gtxdb2,terrorism
4,iraq,/m/0d05q4,terrorism
5,afghanistan,/m/0jdd,terrorism
6,iran,/m/03shp,terrorism
7,Pakistan,/m/05sb1,terrorism
8,Agro,/m/019jkv,terrorism
9,Environmental Terrorism,/m/02w1mcd,terrorism


In [11]:
# Ids that do not start with "/m/" cannot be used for Google Trends queries, so we remove those entries
terrorism_mapping_df = clean_non_firebase_nodes_ids(terrorism_mapping_df)

### Domestic dataset Mapping

In [12]:
# Map every domestic search term to its Knowledge Graph node id
domestic_mapping_df = create_search_terms_to_GKG_node_df(domestic_search_queries, "domestic", API_key)
domestic_mapping_df

Unnamed: 0,search_term,node_equivalent,domain_name
0,Department of Homeland Security,/m/0fytk,domestic
1,Federal Emergency Management Agency,/m/0js8z,domestic
2,Coast Guard,/m/07xhy,domestic
3,Customs and Border Protection,/m/038r8p,domestic
4,Border patrol,/m/02qtlv,domestic
5,Secret Service,/m/0y4n5ll,domestic
6,Bureau of Land Management,/m/0f4r5,domestic
7,Homeland defense,/g/1210mblq,domestic
8,Espionage,/m/02vnz,domestic
9,Task Force 88,/m/04yd3hg,domestic


In [13]:
# Keep only the rows whose node id starts with "/m/" (row labels of the kept rows are unchanged)
domestic_mapping_df = clean_non_firebase_nodes_ids(domestic_mapping_df)
domestic_mapping_df

Unnamed: 0,search_term,node_equivalent,domain_name
0,Department of Homeland Security,/m/0fytk,domestic
1,Federal Emergency Management Agency,/m/0js8z,domestic
2,Coast Guard,/m/07xhy,domestic
3,Customs and Border Protection,/m/038r8p,domestic
4,Border patrol,/m/02qtlv,domestic
5,Secret Service,/m/0y4n5ll,domestic
6,Bureau of Land Management,/m/0f4r5,domestic
8,Espionage,/m/02vnz,domestic
9,Task Force 88,/m/04yd3hg,domestic
10,Central Intelligence Agency,/m/0d6qjf,domestic


### Top-30 terrorism dataset Mapping

In [14]:
# Map every top-30 terrorism search term to its Knowledge Graph node id
top_30_terrorism_mapping_df = create_search_terms_to_GKG_node_df(top_30_terrorism_search_queries, "top_30_terrorism", API_key)
top_30_terrorism_mapping_df

Unnamed: 0,search_term,node_equivalent,domain_name
0,Al Qaeda,/m/0v74,top_30_terrorism
1,Terrorism,/m/07jq_,top_30_terrorism
2,Terror,/m/07jq_,top_30_terrorism
3,Environmental Terrorism,/m/02w1mcd,top_30_terrorism
4,Eco-terrorism,/m/01s60b,top_30_terrorism
5,Conventional Weapon,/m/02t93h,top_30_terrorism
6,Weapons Grade,/m/0blqm8,top_30_terrorism
7,Dirty Bomb,/m/0fjf_,top_30_terrorism
8,Nuclear Enrichment,/g/11k69f5spb,top_30_terrorism
9,Nuclear,/m/04xkp,top_30_terrorism


In [15]:
# Keep only the rows whose node id starts with "/m/" (row labels of the kept rows are unchanged)
top_30_terrorism_mapping_df = clean_non_firebase_nodes_ids(top_30_terrorism_mapping_df)
top_30_terrorism_mapping_df

Unnamed: 0,search_term,node_equivalent,domain_name
0,Al Qaeda,/m/0v74,top_30_terrorism
1,Terrorism,/m/07jq_,top_30_terrorism
2,Terror,/m/07jq_,top_30_terrorism
3,Environmental Terrorism,/m/02w1mcd,top_30_terrorism
4,Eco-terrorism,/m/01s60b,top_30_terrorism
5,Conventional Weapon,/m/02t93h,top_30_terrorism
6,Weapons Grade,/m/0blqm8,top_30_terrorism
7,Dirty Bomb,/m/0fjf_,top_30_terrorism
9,Nuclear,/m/04xkp,top_30_terrorism
10,Chemical Weapon,/m/0bwkbcx,top_30_terrorism


### Create a dataframe with all mappings

In [17]:
all_mappings = [terrorism_mapping_df, domestic_mapping_df, top_30_terrorism_mapping_df]
# Concatenate all mappings into one dataframe.
# ignore_index=True renumbers the rows 0..n-1 directly, so the per-domain row
# labels (which have gaps after the cleaning step) are discarded without going
# through a temporary "index" column.
all_mappings_df = pd.concat(all_mappings, ignore_index=True)
all_mappings_df

Unnamed: 0,search_term,node_equivalent,domain_name
0,Al-Qaeda,/m/0v74,terrorism
1,terrorism,/m/07jq_,terrorism
2,terror,/m/07jq_,terrorism
3,attack,/m/0gtxdb2,terrorism
4,iraq,/m/0d05q4,terrorism
...,...,...,...
93,Tehrik-i-Taliban Pakistan,/m/04jhbb_,top_30_terrorism
94,Political radicalism,/m/03mhnhh,top_30_terrorism
95,Al-Shabaab,/m/027ggxz,top_30_terrorism
96,Recruitment,/m/03nw7w,top_30_terrorism


In [18]:
# Save the combined mapping dataframe as a pickle file named "mapping.pkl" inside data_path
all_mappings_df.to_pickle(data_path+"mapping.pkl")