In [2]:
terrorism_search_queries = ["Al-Qaeda",
                          "terrorism",
                          "terror",
                          "attack",
                          "iraq",
                          "afghanistan",
                          "iran",
                          "Pakistan",
                          "Agro",
                          "Environmental Terrorism",
                          "Eco-Terrorism",
                          "Conventional Weapon",
                          "Weapons Grade",
                          "Dirty Bomb",
                          "Nuclear Enrichment",
                          "Nuclear",
                          "Chemical Weapon",
                          "Biological Weapon",
                          "Ammonium nitrate",
                          "Improvised Explosive Device",
                          "Abu Sayyaf",
                          "Hamas",
                          "FARC",
                          "Irish Republican Army",
                          "Euskadi ta Askatasuna ",
                          "Hezbollah",
                          "Tamil Tigers",
                          "PLO",
                          "Palestine Liberation Front",
                          "Car bomb",
                          "Jihad",
                          "Taliban",
                          "Suicide bomber",
                          "Suicide attack",
                          "AL Qaeda in the Arabian Peninsula",
                          "Al Qaeda in the Islamic Maghreb",
                          "Tehrik-i-Taliban Pakistan",
                          "Yemen",
                          "Pirates",
                          "Extremism",
                          "Somalia",
                          "Nigeria",
                          "Political radicalism",
                          "Al-Shabaab",
                          "Nationalism",
                          "Recruitment",
                          "Fundamentalism",
                          "Islamist"]

top_30_terrorism_search_queries = ["Al Qaeda",
                          "Terrorism",
                          "Terror",
                          "Environmental Terrorism",
                          "Eco-terrorism",
                          "Conventional Weapon",
                          "Weapons Grade",
                          "Dirty Bomb",
                          "Nuclear Enrichment",
                          "Nuclear",
                          "Chemical Weapon",
                          "Biological Weapon",
                          "Improvised Explosive Device",
                          "Abu Sayyaf",
                          "FARC",
                          "Euskadi ta Askatasuna ",
                          "Hezbollah",
                          "Palestine Liberation Front",
                          "Car bomb",
                          "Jihad",
                          "Taliban",
                          "Suicide bomber",
                          "Suicide attack",
                          "AL Qaeda in the Arabian Peninsula",
                          "Al Qaeda in the Islamic Maghreb",
                          "Tehrik-i-Taliban Pakistan",
                          "Political radicalism",
                          "Al-Shabaab",
                          "Recruitment",
                          "Islamist"]

domestic_search_queries = ["Department of Homeland Security",
                          "Federal Emergency Management Agency",
                          "Coast Guard",
                          "Customs and Border Protection ",
                          "Border patrol",
                          "Secret Service",
                          "Bureau of Land Management",
                          "Homeland defense",
                          "Espionage",
                          "Task Force 88",
                          "Central Intelligence Agency",
                          "Fusion center",
                          "DEA",
                          "Secure Border Initiative ",
                          "Federal Bureau of Investigation",
                          "Alcohol and Tobacco Tax and Trade Bureau",
                          "U.S. Citizenship and Immigration Services",
                          "Federal Air Marshal Service ",
                          "Transportation Security Administration",
                          "Air Marshal",
                          "Federal Aviation Administration",
                          "National Guard ",
                          "Emergency Management",
                          "U.S. Immigration and Customs Enforcement",
                          "United Nations"]

# Search Terms

In this notebook we will search for the google trends topics for each DHS terms for both terrorism and domestic categories

In [3]:
import requests
import pandas as pd
import numpy as np

In [4]:
# Directory to save/read data
data_path = "data/"

# Read API_key (you need your own key for it to work)
with open(data_path+"API_key","r") as f:
    API_key = f.read()

In [5]:
base_request_prefix = "https://kgsearch.googleapis.com/v1/entities:search"

### Create simple request to test functionality

In [6]:

params = {
    "query" : "iraq",
    "key"   : API_key,
    "limit" : 10,  
    "indent": True
}
r = requests.get(base_request_prefix, params = params)
entity = r.json()
print(r.url)

https://kgsearch.googleapis.com/v1/entities:search?query=iraq&key=AIzaSyBM4RZcG5Jfel949oWndgzq_rIq-4PI_N8&limit=10&indent=True


In [7]:
entity

{'@context': {'goog': 'http://schema.googleapis.com/',
  'detailedDescription': 'goog:detailedDescription',
  'EntitySearchResult': 'goog:EntitySearchResult',
  '@vocab': 'http://schema.org/',
  'kg': 'http://g.co/kg',
  'resultScore': 'goog:resultScore'},
 '@type': 'ItemList',
 'itemListElement': [{'result': {'@type': ['Place',
     'Country',
     'DefenceEstablishment',
     'AdministrativeArea',
     'Thing'],
    'description': 'Country in the Middle East',
    'name': 'Iraq',
    '@id': 'kg:/m/0d05q4',
    'detailedDescription': {'articleBody': 'Iraq, officially the Republic of Iraq, is a country in Western Asia, bordered by Turkey to the north, Iran to the east, Kuwait to the southeast, Saudi Arabia to the south, Jordan to the southwest and Syria to the west. ',
     'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License',
     'url': 'https://en.wikipedia.org/wiki/Iraq'}},
   '@type': 'EntitySearchResult',
   're

### Result explanation

The response JSON gives a maximum of 10 corresponding entities. They are ordered by "resultScore: An indicator of how well the entity matched the request constraints". Therefore, we choose the first one which has the highest resultScore and we can recover "@id: The canonical URI for the entity". Then, we can select the correct identifier by spliting the result on ":" and taking the second element.

Source: https://developers.google.com/knowledge-graph/reference/rest/v1


In [8]:
print(entity["itemListElement"][0]["result"]['@id'].split(":")[1])

/m/0d05q4


### Create mapping from search query to Google Knowledge Graph Search Node

In [9]:
def create_search_terms_to_GKG_node_df(search_terms, domain_name, API_key):
    """Creates a dataframe mapping a list of search terms to their GKG node representation.
    We make the assumption that the first google result corresponds to the correct topic representation
    (has the highest resultScore).
    This assumption should hold for our project as the terms are not ambuiguous.
    The returned dataframe has attributes:
    {search_term, node_equivalent, domain_name}

    Args:
        search_terms (list[string]): List of all required search queries to map
        domain_name (str): domain of the search queries (ie. terrorims, domestic, top_30_terrorism)
        API_key (str): API key to use for the requests

    Returns:
        dataframe: dataframe mapping search term to their GKG node representation

    """

    # Create function which uses the GKG Api to find the corresponding node for one search query
    def map_search_term(search_term, API_key):
        """Maps one search query to the first google result search node representation"""
        
        # Create base request url
        base_request_prefix = "https://kgsearch.googleapis.com/v1/entities:search"

        # Create parameters for the request
        params = {
            "query" : search_term,
            "key"   : API_key,
            "limit" : 10, # min is 10 but we care only about the first search
            "indent": True
        }
        r = requests.get(base_request_prefix, params = params)
        
        # Find the Google Knowledge Graph Search entity id
        entity_id = r.json()["itemListElement"][0]["result"]['@id'].split(":")[1]
        # Find the Google Knowledge Graph Search entity name
        entity_name = r.json()["itemListElement"][0]["result"]['name']
        
        return (entity_id, entity_name)
    
    # Map search terms to entity id and name equivalent
    entities = [map_search_term(search_term, API_key) for search_term in search_terms]
    
    # Extract entity ids
    entity_ids = [entity[0] for entity in entities]
    
    # Extract entity names
    entity_names = [entity[1] for entity in entities]
    
    # Create domain column
    domain_column = [domain_name]*len(entities)
    
    # Create dictionary for the df
    data={
        "search_term" : search_terms,
        "entity_id"   : entity_ids,
        "entity_name" : entity_names,
        "domain_name" : domain_column
    }
    
    # return the mapping df
    return pd.DataFrame(data=data)
    
    

### Terrorism dataset Mapping

In [10]:
terrorism_mapping_df = create_search_terms_to_GKG_node_df(terrorism_search_queries, "terrorism", API_key)
# Identifier for Google Knowledge Graph API (starting with "/g/") IDs starting with "/m/" are Freebase IDs


In [11]:
terrorism_mapping_df

Unnamed: 0,search_term,entity_id,entity_name,domain_name
0,Al-Qaeda,/m/0v74,Al-Qaeda,terrorism
1,terrorism,/m/07jq_,Terrorism,terrorism
2,terror,/m/07jq_,Terrorism,terrorism
3,attack,/m/0gtxdb2,Attack on Titan,terrorism
4,iraq,/m/0d05q4,Iraq,terrorism
5,afghanistan,/m/0jdd,Afghanistan,terrorism
6,iran,/m/03shp,Iran,terrorism
7,Pakistan,/m/05sb1,Pakistan,terrorism
8,Agro,/m/019jkv,Agronomy,terrorism
9,Environmental Terrorism,/m/02w1mcd,Environmental terrorism,terrorism


### Observation

We spot two problems when we use the highest resultScore directly. 

1. Some search terms might point to the same entity (ex: terror and terrorism both point to the "Terrorism" entity)
2. Some search terms point to a topic that does not correspond to the "Terrorism" domain (ex: attack points to "Attack on Titan" entity)

### Solutions

The first problem can be solved by using only one of the duplicated enties. This should not lead to any problems since the topic itself encompasses both search terms.

For the second problem we can chose to either delete it, individually look up the corresponding topic by using google trends directly or individually analyse the 10 returned entities to find the closest to the one we are looking for.

### Decision

Looking up the corresponding topic on google trends individually for each missguided topic will lead to the closest topic to the search term we require. Indeed, on google trends when selecting a specific search term the related topics are offered. If no topic are found for this entity we will drop it otherwise we will replace the entry by the one provided by google trends.

### Problematic entries and replacement

3	attack:	/m/0gtxdb2	Attack on Titan	=> /g/11bc5q9v7r Attack 

8	Agro:	/m/019jkv	Agronomy => No terrorism topics

14  Nuclear Enrichment:	/g/11k69f5spb	International Conference on Nuclear Fuel Cycle...=> No topics

15  Nuclear:	/m/04xkp	Magnetic resonance imaging => /m/05gpf Nuclear weapon

17  Biological Weapon:	/m/01xmw0	Biological Weapons Convention => /g/121x751y biological weapon

22	FARC:	/m/011ys5	Farce => /m/06hvg Revolutionary Armed Forces of Colombia—People's Army 

32  Suicide bomber:	/g/11cr_hd3g5	Female suicide bomber => No suicide bomber topic

38  Pirates:	/m/01vksx	Pirates of the Caribbean: The Curse of the Bla... => No terrorism topics


In [15]:
def fix_topics(dataframe, id_replacement, name_replacement, topics_id_to_delete):
    """Fixes topic errors in the dataframe due to ambiguous seach terms, duplicates and non-existing topics.   

    Args:
        dataframe (dataframe): dataframe to fix topic. Requires "entity_name" and "entity_id" columns
        id_replacement (dict): dictionary mapping erronous entity id to correct id
        name_replacement (dict): dictionary mapping erronous entity name to correct name
        topics_to_delete (list(str)): List of all topics ids for which to delete the rows
    Returns:
        dataframe: dataframe with the erronous entries fixed
    """

    # Change required entries and leave the rest
    dataframe["entity_id"] = dataframe["entity_id"] \
                                    .apply(lambda x: id_replacement.get(x) if id_replacement.get(x) is not None else x)
    
    dataframe["entity_name"] = dataframe["entity_name"] \
                                    .apply(lambda x: name_replacement.get(x) if name_replacement.get(x) is not None else x)
    
    # Remove duplicate topics
    dataframe = dataframe.drop_duplicates(subset = ["entity_id", "entity_name"])
    
    # Remove topics that have no corresponding google trends topics
    mask = dataframe["entity_id"].apply(lambda x: x not in topics_id_to_delete)
    
    return dataframe[mask]


In [16]:
# Create id replacement dictionary
id_replacement = {
    "/m/0gtxdb2" : "/g/11bc5q9v7r",
    "/m/04xkp"   : "/m/05gpf",
    "/m/01xmw0"  : "/g/121x751y",
    "/m/011ys5"  : "/m/06hvg"
}

# Create name replacement dictionary
name_replacement = {
    "Attack on Titan" : "Attack",
    "Magnetic resonance imaging" : "Nuclear weapon",
    "Biological Weapons Convention" : "biological weapon",
    "Farce" : "Revolutionary Armed Forces of Colombia—People's Army"
}

# Create list of topics ids to delete
topics_id_to_delete = [
    "/g/11k69f5spb",
    "/g/11cr_hd3g5",
    "/m/01vksx"
]

In [17]:
# Fix ambiguous topics
terrorism_mapping_df = fix_topics(terrorism_mapping_df, id_replacement, name_replacement, topics_id_to_delete)

In [20]:
terrorism_mapping_df

Unnamed: 0,search_term,entity_id,entity_name,domain_name
0,Al-Qaeda,/m/0v74,Al-Qaeda,terrorism
1,terrorism,/m/07jq_,Terrorism,terrorism
3,attack,/g/11bc5q9v7r,Attack,terrorism
4,iraq,/m/0d05q4,Iraq,terrorism
5,afghanistan,/m/0jdd,Afghanistan,terrorism
6,iran,/m/03shp,Iran,terrorism
7,Pakistan,/m/05sb1,Pakistan,terrorism
8,Agro,/m/019jkv,Agronomy,terrorism
9,Environmental Terrorism,/m/02w1mcd,Environmental terrorism,terrorism
10,Eco-Terrorism,/m/01s60b,Eco-terrorism,terrorism


### Domestic dataset Mapping

In [21]:
domestic_mapping_df = create_search_terms_to_GKG_node_df(domestic_search_queries, "domestic", API_key)

In [36]:
domestic_mapping_df

Unnamed: 0,search_term,entity_id,entity_name,domain_name
0,Department of Homeland Security,/m/0fytk,United States Department of Homeland Security,domestic
1,Federal Emergency Management Agency,/m/0js8z,Federal Emergency Management Agency,domestic
2,Coast Guard,/m/07xhy,United States Coast Guard,domestic
3,Customs and Border Protection,/m/038r8p,U.S. Customs and Border Protection,domestic
4,Border patrol,/m/02qtlv,United States Border Patrol,domestic
5,Secret Service,/m/0fynw,United States Secret Service,domestic
6,Bureau of Land Management,/m/0f4r5,Bureau of Land Management,domestic
7,Homeland defense,/g/1210mblq,Homeland defense,domestic
8,Espionage,/m/02vnz,Espionage,domestic
9,Task Force 88,/m/04yd3hg,Joint Special Operations Command Task Force in...,domestic


### Observation

There is only one probleme here:

Secret Service	/m/0y4n5ll	Kingsman: The Secret Service => /m/0fynw United States Secret Service

We can observe that most topics are related to the US but this is expected because we use the DHS (Department of Homeland Security) keywords for domestic security. 

In [34]:
# Create id replacement dictionary
domestic_id_replacement = {
    "/m/0y4n5ll" : "/m/0fynw",
}

# Create name replacement dictionary
domestic_name_replacement = {
    "Kingsman: The Secret Service" : "United States Secret Service"
}

In [37]:
# Fix ambiguous topics
domestic_mapping_df = fix_topics(domestic_mapping_df, domestic_id_replacement, domestic_name_replacement, [])

### Top-30 terrorism dataset Mapping

In [38]:
top_30_terrorism_mapping_df = create_search_terms_to_GKG_node_df(top_30_terrorism_search_queries, "top_30_terrorism", API_key)

In [41]:
top_30_terrorism_mapping_df

Unnamed: 0,search_term,entity_id,entity_name,domain_name
0,Al Qaeda,/m/0v74,Al-Qaeda,top_30_terrorism
1,Terrorism,/m/07jq_,Terrorism,top_30_terrorism
3,Environmental Terrorism,/m/02w1mcd,Environmental terrorism,top_30_terrorism
4,Eco-terrorism,/m/01s60b,Eco-terrorism,top_30_terrorism
5,Conventional Weapon,/m/02t93h,Conventional weapon,top_30_terrorism
6,Weapons Grade,/m/0blqm8,Weapons-grade nuclear material,top_30_terrorism
7,Dirty Bomb,/m/0fjf_,Dirty bomb,top_30_terrorism
9,Nuclear,/m/05gpf,Nuclear weapon,top_30_terrorism
10,Chemical Weapon,/m/0bwkbcx,Chemical weapon,top_30_terrorism
11,Biological Weapon,/g/121x751y,biological weapon,top_30_terrorism


### Observation

Since the original search are a subset from the terrorism domain we need to fix the same problems.

In [40]:
# Fix ambiguous topics
top_30_terrorism_mapping_df = fix_topics(top_30_terrorism_mapping_df, id_replacement, name_replacement, topics_id_to_delete)

### Create a dataframe with all mappings

In [42]:
all_mappings = [terrorism_mapping_df, domestic_mapping_df, top_30_terrorism_mapping_df]
# Concatenate all mappings into one dataframe
all_mappings_df = pd.concat(all_mappings).reset_index().drop(columns=['index'])
all_mappings_df

Unnamed: 0,search_term,entity_id,entity_name,domain_name
0,Al-Qaeda,/m/0v74,Al-Qaeda,terrorism
1,terrorism,/m/07jq_,Terrorism,terrorism
2,attack,/g/11bc5q9v7r,Attack,terrorism
3,iraq,/m/0d05q4,Iraq,terrorism
4,afghanistan,/m/0jdd,Afghanistan,terrorism
...,...,...,...,...
90,Tehrik-i-Taliban Pakistan,/m/04jhbb_,Tehrik-i-Taliban Pakistan,top_30_terrorism
91,Political radicalism,/m/03mhnhh,Radical politics,top_30_terrorism
92,Al-Shabaab,/m/027ggxz,Al-Shabaab,top_30_terrorism
93,Recruitment,/m/03nw7w,Recruitment,top_30_terrorism


In [43]:
# Save dataframe to pickle
all_mappings_df.to_pickle(data_path+"mapping.pkl")