# Milestone P4 - A comparative approach on chilling effect, from Wikipedia to Google Trends

## Course CS-401 - Applied Data Analysis

### Instructor : [Robert West](https://dlab.epfl.ch/people/west/)

### Authors:  
- **Chraibi Ghali**  
    SCIPER: 262251
- **Jesslen Artur**  
    SCIPER: 270642
- **Michels Luc**  
    SCIPER: 273666  

#### Due date: 18 Dec 2020

### Imports

In [None]:
#!pip install pickle5
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import numpy as np
import pickle5 as pickle

from os import path
import json
import requests

from helper import create_and_set_gtab, create_search_terms_to_GKG_node_df, fix_topics, get_json_structure

# Directory prefix prepended, by plain string concatenation, to the data file names;
# it therefore has to end with a path separator.
data_path = "data/"
# URL queried with requests.get to search entities of the Google Knowledge Graph
base_request_prefix = "https://kgsearch.googleapis.com/v1/entities:search"

---
### Context
Lorem ipsum

---
### Scraping of the data
Lorem ipsum

### Step 1: Create mapping from search queries to Google Knowledge Graph Search Node

In [None]:
# Set to True to rebuild the mapping from search queries to Knowledge Graph nodes
MAPPING = False
if MAPPING:
    # Read API_key (you need your own key for it to work).
    # Surrounding whitespace is stripped: a key file usually ends with a newline,
    # which would otherwise be sent to the API as part of the key.
    with open(data_path+"API_key","r") as f:
        API_key = f.read().strip()
    # Create simple request to show structure and test if functional
    params = {
        "query" : "iraq",
        "key"   : API_key,
        "limit" : 1,
        "indent": True
    }
    # The timeout keeps the cell from hanging forever when the API is unreachable
    r = requests.get(base_request_prefix, params = params, timeout = 30)
    entity = r.json()
    print(f"JSON structure returned by the API:\n{get_json_structure(entity)}")

# Load the search queries from their pickle file (done whether or not MAPPING is set).
# NOTE: unpickling can execute arbitrary code; only load files you produced or trust.
with open(data_path+"search_queries.pkl", 'rb') as f:
    search_queries = pickle.load(f)

In [None]:
if MAPPING:
    # Corrections handed to fix_topics for both terrorism datasets:
    # entity ids to substitute (old id -> new id)
    id_replacement = {
        "/m/0gtxdb2": "/g/11bc5q9v7r",
        "/m/04xkp": "/m/05gpf",
        "/m/01xmw0": "/g/121x751y",
        "/m/011ys5": "/m/06hvg",
    }
    # entity names to substitute (old name -> new name)
    name_replacement = {
        "Attack on Titan": "Attack",
        "Magnetic resonance imaging": "Nuclear weapon",
        "Biological Weapons Convention": "biological weapon",
        "Farce": "Revolutionary Armed Forces of Colombia—People's Army",
    }
    # ids of the topics to delete
    topics_id_to_delete = ["/g/11k69f5spb", "/g/11cr_hd3g5", "/m/01vksx"]
    topic_fixes = (id_replacement, name_replacement, topics_id_to_delete)

    # Terrorism datasets mapping (full list first, then the top-30 list)
    terrorism_mapping_df = create_search_terms_to_GKG_node_df(
        search_queries['terrorism'], "terrorism", API_key)
    top_30_terrorism_mapping_df = create_search_terms_to_GKG_node_df(
        search_queries['top_30_terrorism'], "top_30_terrorism", API_key)

    # Fix ambiguous topics, with the same corrections for both mappings
    terrorism_mapping_df = fix_topics(terrorism_mapping_df, *topic_fixes)
    top_30_terrorism_mapping_df = fix_topics(top_30_terrorism_mapping_df, *topic_fixes)

In [None]:
if MAPPING:
    # Corrections handed to fix_topics for the domestic dataset
    # (one id, one name, and no topic to delete)
    domestic_id_replacement = {"/m/0y4n5ll": "/m/0fynw"}
    domestic_name_replacement = {
        "Kingsman: The Secret Service": "United States Secret Service",
    }
    # Map the domestic search terms, then fix the ambiguous topics
    domestic_mapping_df = fix_topics(
        create_search_terms_to_GKG_node_df(search_queries['domestic'], "domestic", API_key),
        domestic_id_replacement,
        domestic_name_replacement,
        [],
    )

In [None]:
if MAPPING:
    # Stack the three per-domain mappings into one dataframe with a fresh 0..n-1 index
    all_mappings = [terrorism_mapping_df, domestic_mapping_df, top_30_terrorism_mapping_df]
    all_mappings_df = pd.concat(all_mappings, ignore_index=True)
    # Persist the combined mapping next to the other data files
    mapping_file = data_path+"mapping.pkl"
    all_mappings_df.to_pickle(mapping_file)

### Step 2: Create dataframes from found topics and generated Google Trends anchor banks

In [None]:
def create_dataframe(domain_name, geo, t):
    """Create one dataframe with the Google Trends interest over time of every topic of a domain.

    The topics of the domain are read from the "mapping.pkl" file found under the
    module-level ``data_path``, and one query is issued per topic id.
    The parameters of every failed query are printed.
    The returned dataframe is expected to have the attributes:
    {date, max_ratio, max_ratio_hi, max_ratio_l, article_name, topic_name, topic_id, geo}
    (the last four are added here, the others come from the query results).

    Args:
        domain_name (str): domain of the search queries (i.e. terrorism, domestic, top_30_terrorism)
        geo (str): geolocation of the search queries; the empty string is recorded as "worldwide"
        t (GTAB): GoogleTrendsAnchorBank to use for the queries; it needs to be consistent
            with the geo parameter

    Returns:
        dataframe: concatenation of the interest over time data of all successful queries,
        with the original index turned into a column

    Raises:
        ValueError: if no query succeeded for this domain (nothing to concatenate)
    """
    # NOTE: unpickling can execute arbitrary code; mapping.pkl must come from a trusted source
    with open(data_path+"mapping.pkl", "rb") as file:
        mapping_df = pickle.load(file)

    # Filter the mapping once so that the three lists below are read from the same rows
    domain_df = mapping_df[mapping_df["domain_name"] == domain_name]
    # Article names, topic (freebase) ids and topic names of this domain
    topic_queries_articles = domain_df["search_term"].tolist()
    topic_queries_ids = domain_df["entity_id"].tolist()
    topic_queries_name = domain_df["entity_name"].tolist()

    # For each topic id create the corresponding interest over time google trends data
    all_interest_over_time_dfs = [t.new_query(search_query) for search_query in topic_queries_ids]

    geo_label = "worldwide" if geo == "" else geo

    df_to_concatenate = []
    for article_name, topic_id, topic_name, df in zip(
            topic_queries_articles, topic_queries_ids, topic_queries_name,
            all_interest_over_time_dfs):

        # Anything that is not a dataframe (the original check tested for an int
        # such as -1) is treated as a failed query: report its parameters and skip it
        if not isinstance(df, pd.DataFrame):
            print("Found a failed query")
            print(f"Article name: {article_name}")
            print(f"Topic name: {topic_name}")
            print(f"Topic id: {topic_id}")
            continue

        # Add the article name, topic id, topic name and localisation columns.
        # assign() works on a copy, so the object returned by the query is left untouched
        df_to_concatenate.append(df.assign(
            article_name=article_name,
            topic_id=topic_id,
            topic_name=topic_name,
            geo=geo_label,
        ))

    if not df_to_concatenate:
        raise ValueError(
            f"No successful Google Trends query for domain {domain_name!r} (geo {geo_label!r})")

    # Concatenate all dfs into one central one
    return pd.concat(df_to_concatenate).reset_index()
    
def get_topics(domain_name, data_path):
    """Return the topic ids mapped to the search queries of one domain.

    Args:
         domain_name (str): domain name of the search queries (i.e: terrorism, domestic, top_30_terrorism)
         data_path (str): directory prefix of the mapping pickle file; the file name
            "mapping.pkl" is appended to it as is

    Returns:
        list of the "entity_id" values of all mapping rows belonging to this domain
    """
    mappings = pd.read_pickle(data_path+"mapping.pkl")
    # Boolean mask selecting the rows of the requested domain
    in_domain = mappings["domain_name"] == domain_name
    return mappings.loc[in_domain, "entity_id"].tolist()

In [None]:
# Set to True to query Google Trends again and regenerate the per-country pickle files
CREATE_DATAFRAMES = False
if CREATE_DATAFRAMES:
    # Both time frames start at the beginning of the paper's study period
    start_timeframe = "2012-01-01"
    # (end of the time frame, infix inserted in the output file names)
    timeframes = [
        ("2014-08-31", ""),          # time frame corresponding to the paper study
        ("2020-11-30", "present_"),  # up to present time (stop at November 2020)
    ]

    # The data is collected for each of these geolocalisations in turn
    for geo in ["US", "CA", "GB", "AU", "CH"]:
        for end_timeframe, infix in timeframes:

            # One anchor bank per (time frame, geolocalisation) pair
            t = create_and_set_gtab(start_timeframe, end_timeframe, geo)

            # Create terrorism dataframe
            terrorism_df = create_dataframe("terrorism", geo, t)
            # Create domestic dataframe
            domestic_df = create_dataframe("domestic", geo, t)
            # Create top-30 terrorism dataframe
            top_30_terrorism_df = create_dataframe("top_30_terrorism", geo, t)

            # Save dataframes to pickle, e.g. terrorism_US.pkl and terrorism_present_US.pkl
            terrorism_df.to_pickle(data_path+f"terrorism_{infix}{geo}.pkl")
            domestic_df.to_pickle(data_path+f"domestic_{infix}{geo}.pkl")
            top_30_terrorism_df.to_pickle(data_path+f"top_30_terrorism_{infix}{geo}.pkl")

---
### Replicate the experiment of the paper

#### Load data

In [None]:
data_path = 'data/'

# Fail early, with an explicit message, when the data directory is missing
# (isdir also rejects a plain file that happens to have this name)
if not path.isdir(data_path):
    raise NotADirectoryError(f"Data directory {data_path!r} does not exist")

# Load the three US dataframes.
# NOTE: unpickling can execute arbitrary code; only load files you generated or trust.
with open(data_path+"terrorism_US.pkl", "rb") as f:
    terrorism_df = pickle.load(f)
with open(data_path+"domestic_US.pkl", "rb") as f:
    domestic_df = pickle.load(f)
with open(data_path+"top_30_terrorism_US.pkl", "rb") as f:
    top_30_terrorism_df = pickle.load(f)

In [None]:
# Keep only the date, max_ratio and topic_name columns of the terrorism data
terrorism_df = terrorism_df.loc[:, ['date', 'max_ratio', 'topic_name']]
print(terrorism_df.shape)
# Preview of the first five rows
terrorism_df.head()

#### Analyse data

In [None]:
# Define a constant for the date of revelation: the time series is split into the
# rows dated before and the rows dated after this timestamp (mid-June 2013)
revelation_date = pd.Timestamp('2013-06-15')

In [None]:
# Organise the data in a time series: for each date, the sum of max_ratio over all topics.
# Only max_ratio is aggregated, so that the text column topic_name is never "summed"
# (i.e. concatenated) whatever the pandas version; reset_index turns the date
# group keys back into a regular 'date' column.
terrorism_ts = terrorism_df.groupby(by='date')[['max_ratio']].sum().reset_index()

# Separate the data in two to perform an ITS analysis; a row dated exactly on
# revelation_date would belong to neither part.
# .copy() makes each part an independent dataframe, so that adding columns to it
# does not trigger a SettingWithCopyWarning.
terrorism_pre_june_ts = terrorism_ts[terrorism_ts['date'] < revelation_date].copy()
terrorism_post_june_ts = terrorism_ts[terrorism_ts['date'] > revelation_date].copy()

Step 1

In [None]:
# Mean of max_ratio before, then after, the revelation date
print(terrorism_pre_june_ts['max_ratio'].mean())
print(terrorism_post_june_ts['max_ratio'].mean())
# Number of rows and columns of the post-revelation part
print(terrorism_post_june_ts.shape)
# Display the pre-revelation part
terrorism_pre_june_ts

In [None]:
# Bar chart comparing the mean max_ratio before and after the revelation date
mean_pre_june = terrorism_pre_june_ts['max_ratio'].mean()
mean_post_june = terrorism_post_june_ts['max_ratio'].mean()

bar_positions = [0, 1]
bar_heights = [mean_pre_june, mean_post_june]
plt.bar(bar_positions, bar_heights, color='gray', width=0.4)
plt.xticks(bar_positions, ['Pre June, 2013', 'Post June, 2013'])
plt.ylabel("Average interest")
plt.xlim(-0.5, 1.5)
plt.ylim(0, 200)

# Write each mean just above its bar, slightly left of the bar centre
for position, height in zip(bar_positions, bar_heights):
    plt.text(position - 0.1, height + 5, f"{height:.2f}")
plt.show()

Step 2

In [None]:
# Ordinal (day number) of the revelation date
revelation_date_ord = revelation_date.toordinal()

# Ordinal of the first date of the pre-revelation series, looked up once and by
# position (.iloc) so that it does not depend on the index containing the label 0
first_date_ord = terrorism_pre_june_ts['date'].iloc[0].toordinal()

# ord_date = number of whole weeks elapsed since the first date of the series.
# assign() builds a new dataframe instead of writing into a slice of terrorism_ts,
# which avoids pandas' SettingWithCopyWarning.
terrorism_pre_june_ts = terrorism_pre_june_ts.assign(
    ord_date=terrorism_pre_june_ts['date'].apply(
        lambda x: int((x.toordinal() - first_date_ord)/7))
)


In [None]:
### TODO Modify description of the plot

# Visualisation of full time series
fig, ax = plt.subplots(figsize=(10, 6))

ax.set_title('Figure 2. Pre and Post June 2013 Topic Ratio Trends \n\n', fontsize=14)
ax.set_xlabel('\nTime (?Dates?)', fontsize=11)
ax.set_ylabel('Total views (All ?48? Google Articles)\n', fontsize=11)

ax.scatter(terrorism_ts.date, terrorism_ts['max_ratio'],
    label='Total Articles Views (By Month)', color='black')

# Draw the regressions: one least-squares line per period, fitted on the
# dataframe index (row labels) as x and max_ratio as y
slope_pre_june, intercept_pre_june, _, _, _ = stats.linregress(terrorism_pre_june_ts.index,
                                                               terrorism_pre_june_ts['max_ratio'])
slope_post_june, intercept_post_june, _, _, _ = stats.linregress(terrorism_post_june_ts.index,
                                                                 terrorism_post_june_ts['max_ratio'])
x1 = terrorism_pre_june_ts.index
ax.plot(terrorism_pre_june_ts.date, intercept_pre_june + slope_pre_june*x1,
    color='black', linewidth=3, label='Trend Pre-June 2013')
x2 = terrorism_post_june_ts.index
ax.plot(terrorism_post_june_ts.date, intercept_post_june + slope_post_june*x2,
    color='gray', linewidth=3, label='Trend Post-June 2013')

# Upper limit of the y axis, in data units
y_axis_max = 300

# Emphasise when the revelations occurred.
# axvline expects ymin/ymax as fractions of the axes height (0 = bottom, 1 = top),
# not data values: the line is meant to stop at y = 280 on an axis going up to y_axis_max.
ax.axvline(x=revelation_date, ymin=0, ymax=280 / y_axis_max, color='black', linewidth=3)

x_text_offset = revelation_date - pd.Timedelta(50, 'd')
y_text_offset = 305
ax.text(x_text_offset, y_text_offset, 'Mid June 2013', fontsize=10)

ax.set_ylim(0, y_axis_max)

ax.legend(loc='lower left')
plt.show()

Step 3 - Compare