In [1]:
# Terrorism-related search queries (48 terms). Each string is passed verbatim
# to gtab and later stored in the `article_name` column.
# NOTE(review): "Euskadi ta Askatasuna " carries a trailing space; it is kept
# as-is here so existing data keyed on that exact string keeps matching.
terrorism_search_queries = [
    "Al-Qaeda",
    "terrorism",
    "terror",
    "attack",
    "iraq",
    "afghanistan",
    "iran",
    "Pakistan",
    "Agro",
    "Environmental Terrorism",
    "Eco-Terrorism",
    "Conventional Weapon",
    "Weapons Grade",
    "Dirty Bomb",
    "Nuclear Enrichment",
    "Nuclear",
    "Chemical Weapon",
    "Biological Weapon",
    "Ammonium nitrate",
    "Improvised Explosive Device",
    "Abu Sayyaf",
    "Hamas",
    "FARC",
    "Irish Republican Army",
    "Euskadi ta Askatasuna ",
    "Hezbollah",
    "Tamil Tigers",
    "PLO",
    "Palestine Liberation Front",
    "Car bomb",
    "Jihad",
    "Taliban",
    "Suicide bomber",
    "Suicide attack",
    "AL Qaeda in the Arabian Peninsula",
    "Al Qaeda in the Islamic Maghreb",
    "Tehrik-i-Taliban Pakistan",
    "Yemen",
    "Pirates",
    "Extremism",
    "Somalia",
    "Nigeria",
    "Political radicalism",
    "Al-Shabaab",
    "Nationalism",
    "Recruitment",
    "Fundamentalism",
    "Islamist",
]

# The 30 terrorism-related search queries retained for the "top 30" dataset.
# Spelling and capitalisation differ from `terrorism_search_queries` on
# purpose of fidelity: every string is reproduced exactly as it is queried.
# NOTE(review): "Euskadi ta Askatasuna " carries a trailing space; kept as-is.
top_30_terrorism_search_queries = [
    "Al Qaeda",
    "Terrorism",
    "Terror",
    "Environmental Terrorism",
    "Eco-terrorism",
    "Conventional Weapon",
    "Weapons Grade",
    "Dirty Bomb",
    "Nuclear Enrichment",
    "Nuclear",
    "Chemical Weapon",
    "Biological Weapon",
    "Improvised Explosive Device",
    "Abu Sayyaf",
    "FARC",
    "Euskadi ta Askatasuna ",
    "Hezbollah",
    "Palestine Liberation Front",
    "Car bomb",
    "Jihad",
    "Taliban",
    "Suicide bomber",
    "Suicide attack",
    "AL Qaeda in the Arabian Peninsula",
    "Al Qaeda in the Islamic Maghreb",
    "Tehrik-i-Taliban Pakistan",
    "Political radicalism",
    "Al-Shabaab",
    "Recruitment",
    "Islamist",
]

# Domestic-security-related search queries (25 terms), passed verbatim to gtab.
# NOTE(review): several entries end with a trailing space ("Customs and Border
# Protection ", "Secure Border Initiative ", "Federal Air Marshal Service ",
# "National Guard "); they are kept byte-for-byte.
domestic_search_queries = [
    "Department of Homeland Security",
    "Federal Emergency Management Agency",
    "Coast Guard",
    "Customs and Border Protection ",
    "Border patrol",
    "Secret Service",
    "Bureau of Land Management",
    "Homeland defense",
    "Espionage",
    "Task Force 88",
    "Central Intelligence Agency",
    "Fusion center",
    "DEA",
    "Secure Border Initiative ",
    "Federal Bureau of Investigation",  # TODO: replace with "FBI"?
    "Alcohol and Tobacco Tax and Trade Bureau",  # TODO: same here, there may be an acronym
    "U.S. Citizenship and Immigration Services",
    "Federal Air Marshal Service ",
    "Transportation Security Administration",
    "Air Marshal",
    "Federal Aviation Administration",
    "National Guard ",
    "Emergency Management",  # TODO: split into separate terms?
    "U.S. Immigration and Customs Enforcement",
    "United Nations",
]

# Data Wrangling

In this notebook we create the following datasets, all covering the same time frame:

    - 48 terrorism-related search queries [2012-01-01 -> 2014-08-31]
    - 25 domestic-related search queries [2012-01-01 -> 2014-08-31]
    - Top 30 terrorism-related search queries from the MTurk evaluation [2012-01-01 -> 2014-08-31]


### Imports

In [2]:
!pip install gtab
import os
import warnings

import gtab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [3]:
# Directory handed to gtab as its working directory (anchorbanks live below it)
my_path = "gtab_data"
t = gtab.GTAB(dir_path=my_path)

# Directory where the generated dataframes are pickled. It is created up front
# so the `to_pickle` calls further down do not fail on a fresh checkout.
data_path = "data/"
os.makedirs(data_path, exist_ok=True)

Directory already exists, loading data from it.
Using directory 'gtab_data'
Active anchorbank changed to: google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv



In [6]:
# TODO: should we extend the range up to the present for an extended analysis?
# Time frame matching the one studied in the paper
start_timeframe = "2012-01-01"
# TODO: assumes the end bound is inclusive (it seems to be); otherwise use "2014-09-01"
end_timeframe = "2014-08-31"
timeframe = f"{start_timeframe} {end_timeframe}"

# Worldwide geolocation is chosen to mimic Wikipedia;
# an empty string corresponds to worldwide
worldwide = ""

In [7]:
# List all existing anchorbanks and show which one is currently active
t.list_gtabs()

Existing GTABs:
	google_anchorbank_geo=IT_timeframe=2019-01-01 2020-08-01.tsv
	google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
	google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv
Active anchorbank: google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv


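**The anchorbank we need (worldwide, 2012-01-01 to 2014-08-31) must appear in the list above. If it does not exist yet, it has to be created first; the next cells configure gtab for our geo and time frame and create it.**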

In [9]:
# Configure the pytrends options used by gtab: worldwide geo and the study timeframe
t.set_options(pytrends_config={"geo": worldwide, "timeframe": timeframe})

In [10]:
# Build the anchorbank for the configured geo/timeframe only when its file is
# missing: creation queries Google Trends and takes a while to run, so an
# existing anchorbank is never rebuilt. The path layout mirrors the one gtab
# prints when running a query (<dir_path>/output/google_anchorbanks/<name>.tsv).
anchorbank_file = os.path.join(
    my_path,
    "output",
    "google_anchorbanks",
    f"google_anchorbank_geo={worldwide}_timeframe={timeframe}.tsv",
)
if not os.path.exists(anchorbank_file):
    t.create_anchorbank()

In [11]:
# Activate the anchorbank matching the worldwide geo and the paper timeframe
t.set_active_gtab(f"google_anchorbank_geo={worldwide}_timeframe={timeframe}.tsv")

Active anchorbank changed to: google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv



In [12]:
def create_dataframe(search_queries, geo, t):
    """Build one dataframe with the interest-over-time data of every search query.

    Each query is sent to gtab through ``t.new_query``; the resulting frame is
    tagged with the query text and the geolocation, and all frames are then
    stacked into a single dataframe.

    The returned dataframe has the columns:
    {date, max_ratio, max_ratio_hi, max_ratio_lo, article_name, geo}

    Args:
        search_queries (list[str]): Search queries to include in the dataframe.
        geo (str): Geolocation of the search queries. An empty string (gtab's
            value for "worldwide") is stored as "worldwide" in the ``geo``
            column; any other value is stored as given.
        t (GTAB): GoogleTrendsAnchorBank used for the queries. Its active
            anchorbank needs to be consistent with the ``geo`` parameter.

    Returns:
        pandas.DataFrame: Concatenation of the interest-over-time data of all
        queries, with a fresh integer index. Empty (but with the columns listed
        above) when no query produced any data.
    """
    # Human-readable label for the geo column; "" means worldwide
    geo_label = geo if geo else "worldwide"

    tagged_frames = []
    for search_query in search_queries:
        interest_over_time = t.new_query(search_query)
        # NOTE(review): gtab appears to return a non-DataFrame sentinel when a
        # query cannot be calibrated -- TODO confirm. Such results are skipped
        # with a warning instead of crashing on the column assignment.
        if not isinstance(interest_over_time, pd.DataFrame):
            warnings.warn(f"No data returned for query {search_query!r}; skipping it.")
            continue
        # `assign` works on a copy, so the frame handed back by gtab is untouched
        tagged_frames.append(
            interest_over_time.assign(article_name=search_query, geo=geo_label)
        )

    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not tagged_frames:
        return pd.DataFrame(
            columns=["date", "max_ratio", "max_ratio_hi", "max_ratio_lo", "article_name", "geo"]
        )

    # Stack everything into one central dataframe; the index becomes a regular column
    return pd.concat(tagged_frames).reset_index()
    


### Terrorism dataset

48 Terrorism related search queries [2012-01-01 -> 2014-08-31]

In [52]:
# Sanity check: number of terrorism search queries (the list is defined in the first cell)
#TODO: should we preprocess the queries ie. remove uppercase?

len(terrorism_search_queries)

48

In [53]:
# Create the terrorism dataframe. The geolocation is passed through the
# `worldwide` variable defined above (`geo` is never defined in this notebook).
terrorism_df = create_dataframe(terrorism_search_queries, worldwide, t)

Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Al-Qaeda'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'terrorism'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'terror'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'attack'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'iraq'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'afghanistan'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'iran'
New query calibrated!


In [56]:
# Preview the first rows of the terrorism dataframe
terrorism_df.head()

Unnamed: 0,date,max_ratio,max_ratio_hi,max_ratio_lo,article_name,geo
0,2012-01-01,0.21,0.215,0.205,Al-Qaeda,worldwide
1,2012-01-08,0.25,0.255,0.245,Al-Qaeda,worldwide
2,2012-01-15,0.25,0.255,0.245,Al-Qaeda,worldwide
3,2012-01-22,0.28,0.285,0.275,Al-Qaeda,worldwide
4,2012-01-29,0.25,0.255,0.245,Al-Qaeda,worldwide


In [57]:
# Persist the terrorism dataframe as a pickle inside the data directory
terrorism_df.to_pickle(f"{data_path}terrorism.pkl")

### Domestic dataset

25 Domestic related search queries [2012-01-01 -> 2014-08-31]

In [50]:
# Sanity check: number of domestic search queries (the list is defined in the first cell)
#TODO: should we preprocess the queries ie. remove uppercase?

len(domestic_search_queries)

25

In [None]:
# Create the domestic dataframe. The geolocation is passed through the
# `worldwide` variable defined above (`geo` is never defined in this notebook).
dommestic_df = create_dataframe(domestic_search_queries, worldwide, t)

In [None]:
# Preview the first rows of the domestic dataframe
dommestic_df.head()

In [None]:
# Persist the domestic dataframe as a pickle inside the data directory
# (the file name keeps its historical spelling)
dommestic_df.to_pickle(f"{data_path}dommestic.pkl")

### Top 30 Terrorism dataset

Top 30 terrorism-related search queries from the MTurk evaluation [2012-01-01 -> 2014-08-31]

In [58]:
# Sanity check: number of top-30 terrorism search queries (the list is defined in the first cell)
#TODO: should we preprocess the queries ie. remove uppercase?

len(top_30_terrorism_search_queries)

34

In [59]:

# Create the top 30 terrorism dataframe. The geolocation is passed through the
# `worldwide` variable defined above (`geo` is never defined in this notebook).
top_30_terrorism_df = create_dataframe(top_30_terrorism_search_queries, worldwide, t)

Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Al Qaeda'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Terrorism'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Terror'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Environmental Terrorism'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Eco-terrorism'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Pakistan'
New query calibrated!
Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query 'Agro'

In [60]:
# Preview the first rows of the top 30 terrorism dataframe
top_30_terrorism_df.head()

Unnamed: 0,date,max_ratio,max_ratio_hi,max_ratio_lo,article_name,geo
0,2012-01-01,1.575758,1.695447,1.463272,Al Qaeda,worldwide
1,2012-01-08,1.575758,1.695447,1.463272,Al Qaeda,worldwide
2,2012-01-15,1.818182,1.946625,1.697395,Al Qaeda,worldwide
3,2012-01-22,1.575758,1.695447,1.463272,Al Qaeda,worldwide
4,2012-01-29,1.454545,1.569859,1.34621,Al Qaeda,worldwide


In [61]:
# Persist the top 30 terrorism dataframe as a pickle inside the data directory
top_30_terrorism_df.to_pickle(f"{data_path}top_30_terrorism.pkl")

In [15]:
# Ad-hoc query for a single identifier, using the worldwide geolocation
# (`geo` is never defined in this notebook, `worldwide` is).
# NOTE(review): "/g/11cr_hd3g5" looks like a Google topic identifier rather
# than a plain search term -- TODO document which topic it refers to.
dff = create_dataframe(["/g/11cr_hd3g5"], worldwide, t)


Using gtab_data\output\google_anchorbanks\google_anchorbank_geo=_timeframe=2012-01-01 2014-08-31.tsv
New query '/g/11cr_hd3g5'
New query calibrated!


In [17]:
# Mean of the calibrated `max_ratio` values of the ad-hoc query over the timeframe
dff.max_ratio.mean()

0.017962285714285726