After performing this analysis on the data I collected from my computer, I'll do the same thing with data collected by Gianna Grün, the DW Data editor. This way, we can be sure that this kind of content was not being served to me because of my demographic profile.

In [1]:
import pandas as pd
import json
import glob
import os
import numpy as np
import re

#### Configs

In [2]:
# Show up to 500 rows when a dataframe is displayed
pd.set_option('display.max_rows', 500)

## Reading results, title tags and image paths
The functions below come from '4.aggregate-raw-data.ipynb', but are slightly tweaked to read and parse data that is in a different directory.

In [3]:
def read_jsons(directory="output_gcg"):
    '''
    Reads the JSON files with the Google Image Search results
    and stores them into a single dictionary, which is returned.

    Params:

    directory: name of the data collection folder, looked up one level
               above the working directory. Defaults to "output_gcg",
               the folder used by the rest of this notebook.

    Returns:

    A dictionary that maps each search query (the value stored under
    ["search_parameters"]["q"]) to the file's "images_results" list.
    If two files hold the same query, the one read last wins.
    '''

    # Grabs all the filepaths. They are sorted so that the order of the
    # dictionary does not depend on the arbitrary order glob lists them in.
    files = sorted(glob.glob(f"../{directory}/search_results/*.json"))

    # Reads all those files and saves them into a new dictionary
    jsons = {}
    for file in files:

        # The encoding is explicit so the result does not
        # depend on the default encoding of the platform
        with open(file, encoding="utf-8") as f:
            data = json.load(f)

        query = data["search_parameters"]['q']
        jsons[query] = data["images_results"]

    return jsons

In [4]:
def make_df(jsons):
    '''
    Builds a single pandas dataframe out of the dictionary
    produced by the function read_jsons().

    Each value of the dictionary (a list of result dictionaries)
    becomes a block of rows, tagged with its key in the
    'search_query' column. The blocks are stacked in dictionary
    order and the index is renumbered from zero.
    '''

    # One dataframe per search query, each carrying
    # an identifying column with the respective query
    frames = [
        pd.DataFrame(results).assign(search_query=query)
        for query, results in jsons.items()
    ]

    return pd.concat(frames, ignore_index=True)

In [5]:
def add_information(df, directory="output_gcg"):
    '''
    Adds both the local image filepath and the downloaded page title
    to each row in the dataframe. If there is no image or title
    file for that search result, the function fills the row with a None.

    Params:

    df: dataframe with the columns 'search_query' and 'position'.
        It is modified in place.
    directory: name of the data collection folder, looked up one level
               above the working directory. Defaults to "output_gcg".

    Returns:

    The same dataframe, with the new columns 'img_path' and 'full_title'.
    '''

    def get_img_path(search_query, position):
        '''
        Fetches the local path of the image
        downloaded for one search entry,
        if there is one. Returns a None if there's not.
        '''

        # We need to use the glob approach here to check if the file exists
        # because we can't be sure about the extension of the image file.
        pattern = f'../{directory}/imgs/{search_query.replace(" ","-")}/{position}.*'
        files = glob.glob(pattern)

        if not files:
            return None

        assert len(files) == 1, f"more than one image matches {pattern}: {files}"
        return files[0]

    def get_full_title(search_query, position):
        '''
        Fetches the title stored in the title-tag JSON file of one
        search entry, if there is one. Returns a None if there's not.
        '''

        fname = f'../{directory}/link_contents/{search_query.replace(" ","-")}/{position}-title-tag.json'

        if not os.path.exists(fname):
            return None

        with open(fname, "r", encoding="utf-8") as f:
            data = json.load(f)

        # There is an entry with a \r in the title, which I have to remove
        return data["title"].replace("\r", " ")

    # The helpers return plain values and the columns are built from lists:
    # assigning the one-column dataframe that df.apply produces for
    # Series-returning functions is fragile and fails on an empty dataframe.
    entries = list(zip(df.search_query, df.position))

    df["img_path"] = [get_img_path(query, position) for query, position in entries]
    df["full_title"] = [get_full_title(query, position) for query, position in entries]

    return df

#### Saving Gianna's output as CSV

In [6]:
def create_df():
    '''
    Runs the whole aggregation: reads the search result JSONs, turns
    them into a dataframe, adds the image paths and page titles, and
    saves the result as a CSV file (without the index).
    '''
    aggregated = add_information(make_df(read_jsons()))
    aggregated.to_csv("../output_gcg/dataset/aggregated-raw-data.csv", index=False)

In [7]:
# Builds the aggregated dataset and writes it to the CSV file
create_df()

In [8]:
# Reads the aggregated dataset back from the CSV file
df_gcg = pd.read_csv("../output_gcg/dataset/aggregated-raw-data.csv")

## Safe search and geographical data

Now we can proceed to do the Google SafeSearch analysis on Gianna's files. We will also do this externally and then read the results back again. In the process, we will also merge them with the CSV that has ISO codes and country names.

In [9]:
def retrieve_safesearch_likelihood(row):
    '''
    Reads the SafeSearch likelihoods stored in the Cloud Vision
    result file of one search result.

    Params:

    row: a dataframe row with the fields 'search_query' and 'position'.

    Returns:

    A pandas Series with the entries 'adult', 'racy', 'spoof',
    'violence' and 'medical'. If there is no result file for the row,
    the Series has the same entries, all set to None.
    '''

    categories = ["adult", "racy", "spoof", "violence", "medical"]

    # Build the path to the file with the Cloud Vision API data
    search_query = row.search_query.replace(" ","-")
    position = row.position

    fpath = f"../output_gcg/vision_results/{search_query}-{position}.json"

    try:
        # Access the JSON file
        with open(fpath, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # A Series with the same labels is returned instead of a bare None:
        # df.apply(..., axis=1) only builds a dataframe when the result of the
        # first row is a Series, so a missing file in the first row would
        # otherwise change the shape of the whole output.
        return pd.Series({category: None for category in categories})

    safe_search = data["safe_search_annotations"]

    return pd.Series({
        category: safe_search[category]["likelihood"]
        for category in categories
    })

In [10]:
def retrieve_m49_standard(df):
    '''
    Adds nationality and UN M49 geographical information
    to the search results.

    The nationalities CSV is joined with the M49 table on the country
    name. Nationalities that find no M49 match are dropped and replaced
    by the manually written rows below. The result is then joined with
    the search results on the 'search_query' column, which is built as
    the lowercase adjectival followed by " women".

    Params:

    df: dataframe of search results with a 'search_query' column.

    Returns:

    A new dataframe that keeps only the search results whose query
    matches a nationality, with the nationality and M49 columns added.
    '''

    # Reads the nationalities, adjectivals and demonyms csv
    nationalities = pd.read_csv("../input/nationalities.csv")

    # Reads and formats the CSV with the UN M49 standard classification.
    # Everything is read as text, so codes such as "068" stay untouched.
    m49 = pd.read_csv("../input/unsd-m49.tsv", sep='\t', dtype="str")

    m49 = m49[["Region Code", "Region Name",
               "Sub-region Code", "Sub-region Name",
               "Intermediate Region Code", "Intermediate Region Name",
               "Country or Area", "M49 Code", "ISO-alpha3 Code"]]

    m49 = m49.rename(columns={
        "Region Code": "region_code",
        "Region Name": "region_name",
        "Sub-region Code": "sub_region_code",
        "Sub-region Name": "sub_region_name",
        "Country or Area": "country_or_area",
        "Intermediate Region Code": "intermediate_region_code",
        "Intermediate Region Name": "intermediate_region_name",
        "M49 Code": "m49_code",
        "ISO-alpha3 Code": "iso_a3"
    })

    # Merges them both
    nationalities = nationalities.merge(m49, left_on="country", right_on="country_or_area", how="left")

    # And manually fill the missing values of interest.
    # One row per nationality, in the column order of fill_columns.
    fill_columns = [
        "country", "country_or_area", "adjectivals", "demonyms",
        "region_code", "region_name", "sub_region_code", "sub_region_name",
        "intermediate_region_code", "intermediate_region_name",
        "m49_code", "iso_a3",
    ]

    # Notes on the rows:
    # - England, Scotland, Wales and Northern Ireland have no ISO code of
    #   their own, so they get x-prefixed placeholders. Northern Ireland
    #   must not use 'IRN', which is the code of Iran.
    # - Region code "009" (Micronesia) is Oceania in the M49 standard.
    # - The East Timor row was disabled in the original notebook and had
    #   no iso_a3 value, so it is still left out.
    fill_rows = [
        ("Bolivia", "Bolivia (Plurinational State of)", "Bolivian", "Bolivians",
         "019", "Americas", "419", "Latin America and the Caribbean",
         "005", "South America", "068", "BOL"),
        ("Brunei", "Brunei Darussalam", "Bruneian", "Bruneians",
         "142", "Asia", "035", "South-eastern Asia",
         np.nan, np.nan, "096", "BRN"),
        ("Czech Republic", "Czechia", "Czech", "Czechs",
         "150", "Europe", "151", "Eastern Europe",
         np.nan, np.nan, "203", "CZE"),
        ("England", "England", "English", "English",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xENG"),
        ("Iran", "Iran (Islamic Republic of)", "Iranian", "Iranians",
         "142", "Asia", "034", "Southern Asia",
         np.nan, np.nan, "364", "IRN"),
        ("Ivory Coast", "Côte d’Ivoire", "Ivorian", "Ivorians",
         "002", "Africa", "202", "Sub-Saharan Africa",
         "011", "Western Africa", "384", "CIV"),
        ("North Korea", "Democratic People's Republic of Korea", "North Korean", "North Koreans",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, "408", "PRK"),
        ("South Korea", "Republic of Korea", "South Korean", "South Koreans",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, "410", "KOR"),
        ("Kosovo", "Kosovo", "Kosovar", "Kosovars",
         "150", "Europe", "039", "Southern Europe",
         np.nan, np.nan, np.nan, "XKX"),
        ("Lao", "Lao People's Democratic Republic", "Laos", "Laotians",
         "142", "Asia", "035", "South-eastern Asia",
         np.nan, np.nan, "418", "LAO"),
        ("Macau", "China, Macao Special Administrative Region", "Macanese", "Chinese",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, "446", "MAC"),
        ("Micronesia", "Micronesia (Federated States of)", "Micronesian", "Micronesians",
         "009", "Oceania", "057", "Micronesia",
         np.nan, np.nan, "583", "FSM"),
        ("Moldova", "Republic of Moldova", "Moldovan", "Moldovans",
         "150", "Europe", "151", "Eastern Europe",
         np.nan, np.nan, "498", "MDA"),
        ("Northern Ireland", "Northern Ireland", "Northern Irish", "Northern Irish",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xNIR"),
        ("Palestine", "State of Palestine", "Palestinian", "Palestinians",
         "142", "Asia", "145", "Western Asia",
         np.nan, np.nan, "275", "PSE"),
        ("Russia", "Russian Federation", "Russian", "Russians",
         "150", "Europe", "151", "Eastern Europe",
         np.nan, np.nan, "643", "RUS"),
        ("Sahrawi Arab Democratic Republic", "Western Sahara", "Western Saharan", "Western Saharans",
         "002", "Africa", "015", "Northern Africa",
         np.nan, np.nan, "732", "ESH"),
        ("São Tomé and Príncipe", "Sao Tome and Principe", "São Toméan", "São Toméans",
         "002", "Africa", "202", "Sub-Saharan Africa",
         "017", "Middle Africa", "678", "STP"),
        ("Scotland", "Scotland", "Scottish", "Scots",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xSCO"),
        ("Syrian", "Syrian Arab Republic", "Syrian", "Syrians",
         "142", "Asia", "145", "Western Asia",
         np.nan, np.nan, "760", "SYR"),
        ("Taiwan", "Taiwan", "Taiwanese", "Taiwanese",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, np.nan, "TWN"),
        ("Tanzania", "United Republic of Tanzania", "Tanzanian", "Tanzanians",
         "002", "Africa", "202", "Sub-Saharan Africa",
         "014", "Eastern Africa", "834", "TZA"),
        ("United Kingdom", "United Kingdom of Great Britain and Northern Ireland", "British", "Britons",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, "826", "GBR"),
        ("Venezuela", "Venezuela (Bolivarian Republic of)", "Venezuelan", "Venezuelans",
         "019", "Americas", "419", "Latin America and the Caribbean",
         "005", "South America", "862", "VEN"),
        ("Vietnam", "Viet Nam", "Vietnamese", "Vietnamese",
         "142", "Asia", "035", "South-eastern Asia",
         np.nan, np.nan, "704", "VNM"),
        ("Wales", "Wales", "Welsh", "Walian",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xWAL"),
    ]

    # Performs the replacement: the nationalities without an M49
    # match are dropped and the manual rows are appended instead
    fill_values = pd.DataFrame(fill_rows, columns=fill_columns)

    nationalities = nationalities.dropna(subset=["m49_code"])

    # reset_index() keeps the previous index as an extra 'index' column
    nationalities = pd.concat([nationalities, fill_values]).reset_index()

    # Now we can select the merged values and join them with the search result data
    nationalities["search_query"] = nationalities.adjectivals.str.lower() + " women"

    df = df.merge(nationalities, on='search_query')

    return df

In [11]:
def retrieve_m49_standard(df):
    '''
    Adds nationality and UN M49 geographical information
    to the search results.

    The nationalities CSV is joined with the M49 table on the country
    name. Nationalities that find no M49 match are dropped and replaced
    by the manually written rows below. The result is then joined with
    the search results on the 'search_query' column, which is built as
    the lowercase adjectival followed by " women".

    Params:

    df: dataframe of search results with a 'search_query' column.

    Returns:

    A new dataframe that keeps only the search results whose query
    matches a nationality, with the nationality and M49 columns added.
    '''

    # Reads the nationalities, adjectivals and demonyms csv
    nationalities = pd.read_csv("../input/nationalities.csv")

    # Reads and formats the CSV with the UN M49 standard classification.
    # Everything is read as text, so codes such as "068" stay untouched.
    m49 = pd.read_csv("../input/unsd-m49.tsv", sep='\t', dtype="str")

    m49 = m49[["Region Code", "Region Name",
               "Sub-region Code", "Sub-region Name",
               "Intermediate Region Code", "Intermediate Region Name",
               "Country or Area", "M49 Code", "ISO-alpha3 Code"]]

    m49 = m49.rename(columns={
        "Region Code": "region_code",
        "Region Name": "region_name",
        "Sub-region Code": "sub_region_code",
        "Sub-region Name": "sub_region_name",
        "Country or Area": "country_or_area",
        "Intermediate Region Code": "intermediate_region_code",
        "Intermediate Region Name": "intermediate_region_name",
        "M49 Code": "m49_code",
        "ISO-alpha3 Code": "iso_a3"
    })

    # Merges them both
    nationalities = nationalities.merge(m49, left_on="country", right_on="country_or_area", how="left")

    # And manually fill the missing values of interest.
    # One row per nationality, in the column order of fill_columns.
    fill_columns = [
        "country", "country_or_area", "adjectivals", "demonyms",
        "region_code", "region_name", "sub_region_code", "sub_region_name",
        "intermediate_region_code", "intermediate_region_name",
        "m49_code", "iso_a3",
    ]

    # Notes on the rows:
    # - England, Scotland, Wales and Northern Ireland have no ISO code of
    #   their own, so they get x-prefixed placeholders. Northern Ireland
    #   must not use 'IRN', which is the code of Iran.
    # - Region code "009" (Micronesia) is Oceania in the M49 standard.
    # - The East Timor row was disabled in the original notebook and had
    #   no iso_a3 value, so it is still left out.
    fill_rows = [
        ("Bolivia", "Bolivia (Plurinational State of)", "Bolivian", "Bolivians",
         "019", "Americas", "419", "Latin America and the Caribbean",
         "005", "South America", "068", "BOL"),
        ("Brunei", "Brunei Darussalam", "Bruneian", "Bruneians",
         "142", "Asia", "035", "South-eastern Asia",
         np.nan, np.nan, "096", "BRN"),
        ("Czech Republic", "Czechia", "Czech", "Czechs",
         "150", "Europe", "151", "Eastern Europe",
         np.nan, np.nan, "203", "CZE"),
        ("England", "England", "English", "English",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xENG"),
        ("Iran", "Iran (Islamic Republic of)", "Iranian", "Iranians",
         "142", "Asia", "034", "Southern Asia",
         np.nan, np.nan, "364", "IRN"),
        ("Ivory Coast", "Côte d’Ivoire", "Ivorian", "Ivorians",
         "002", "Africa", "202", "Sub-Saharan Africa",
         "011", "Western Africa", "384", "CIV"),
        ("North Korea", "Democratic People's Republic of Korea", "North Korean", "North Koreans",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, "408", "PRK"),
        ("South Korea", "Republic of Korea", "South Korean", "South Koreans",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, "410", "KOR"),
        ("Kosovo", "Kosovo", "Kosovar", "Kosovars",
         "150", "Europe", "039", "Southern Europe",
         np.nan, np.nan, np.nan, "XKX"),
        ("Lao", "Lao People's Democratic Republic", "Laos", "Laotians",
         "142", "Asia", "035", "South-eastern Asia",
         np.nan, np.nan, "418", "LAO"),
        ("Macau", "China, Macao Special Administrative Region", "Macanese", "Chinese",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, "446", "MAC"),
        ("Micronesia", "Micronesia (Federated States of)", "Micronesian", "Micronesians",
         "009", "Oceania", "057", "Micronesia",
         np.nan, np.nan, "583", "FSM"),
        ("Moldova", "Republic of Moldova", "Moldovan", "Moldovans",
         "150", "Europe", "151", "Eastern Europe",
         np.nan, np.nan, "498", "MDA"),
        ("Northern Ireland", "Northern Ireland", "Northern Irish", "Northern Irish",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xNIR"),
        ("Palestine", "State of Palestine", "Palestinian", "Palestinians",
         "142", "Asia", "145", "Western Asia",
         np.nan, np.nan, "275", "PSE"),
        ("Russia", "Russian Federation", "Russian", "Russians",
         "150", "Europe", "151", "Eastern Europe",
         np.nan, np.nan, "643", "RUS"),
        ("Sahrawi Arab Democratic Republic", "Western Sahara", "Western Saharan", "Western Saharans",
         "002", "Africa", "015", "Northern Africa",
         np.nan, np.nan, "732", "ESH"),
        ("São Tomé and Príncipe", "Sao Tome and Principe", "São Toméan", "São Toméans",
         "002", "Africa", "202", "Sub-Saharan Africa",
         "017", "Middle Africa", "678", "STP"),
        ("Scotland", "Scotland", "Scottish", "Scots",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xSCO"),
        ("Syrian", "Syrian Arab Republic", "Syrian", "Syrians",
         "142", "Asia", "145", "Western Asia",
         np.nan, np.nan, "760", "SYR"),
        ("Taiwan", "Taiwan", "Taiwanese", "Taiwanese",
         "142", "Asia", "030", "Eastern Asia",
         np.nan, np.nan, np.nan, "TWN"),
        ("Tanzania", "United Republic of Tanzania", "Tanzanian", "Tanzanians",
         "002", "Africa", "202", "Sub-Saharan Africa",
         "014", "Eastern Africa", "834", "TZA"),
        ("United Kingdom", "United Kingdom of Great Britain and Northern Ireland", "British", "Britons",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, "826", "GBR"),
        ("Venezuela", "Venezuela (Bolivarian Republic of)", "Venezuelan", "Venezuelans",
         "019", "Americas", "419", "Latin America and the Caribbean",
         "005", "South America", "862", "VEN"),
        ("Vietnam", "Viet Nam", "Vietnamese", "Vietnamese",
         "142", "Asia", "035", "South-eastern Asia",
         np.nan, np.nan, "704", "VNM"),
        ("Wales", "Wales", "Welsh", "Walian",
         "150", "Europe", "154", "Northern Europe",
         np.nan, np.nan, np.nan, "xWAL"),
    ]

    # Performs the replacement: the nationalities without an M49
    # match are dropped and the manual rows are appended instead
    fill_values = pd.DataFrame(fill_rows, columns=fill_columns)

    nationalities = nationalities.dropna(subset=["m49_code"])

    # reset_index() keeps the previous index as an extra 'index' column
    nationalities = pd.concat([nationalities, fill_values]).reset_index()

    # Now we can select the merged values and join them with the search result data
    nationalities["search_query"] = nationalities.adjectivals.str.lower() + " women"

    df = df.merge(nationalities, on='search_query')

    return df

In [12]:
# Adds the count of all images fetched successfully
def add_counts(df):
    '''
    Adds two columns to the dataframe, both computed per search query:

    imgs_fetched: how many rows of that query have an image path
    results_fetched: how many rows that query has in total

    Params:

    df: dataframe with the columns 'search_query' and 'img_path'.
        It is modified in place.

    Returns:

    The same dataframe, with the two new columns.
    '''

    # Counts the rows of each query in a single pass, instead of
    # filtering the whole dataframe again for every unique query
    results_fetched = df.search_query.value_counts()
    imgs_fetched = df.loc[df.img_path.notna(), "search_query"].value_counts()

    # Creates the new columns by matching the search query column with the
    # counts above. A query without any image is absent from imgs_fetched,
    # so the missing matches are filled with zero.
    df["imgs_fetched"] = df.search_query.map(imgs_fetched).fillna(0).astype(int)
    df["results_fetched"] = df.search_query.map(results_fetched).fillna(0).astype(int)

    return df

In [13]:
def add_safe_search():
    '''
    Reads the aggregated raw data from the output_gcg directory,
    appends the output of retrieve_safesearch_likelihood (applied
    row by row) and of retrieve_m49_standard, and saves the result
    as complete-data.csv in the same directory.
    '''

    raw = pd.read_csv("../output_gcg/dataset/aggregated-raw-data.csv")

    # The helper is called once per row; its output is placed side by side with the raw columns
    likelihoods = raw.apply(retrieve_safesearch_likelihood, axis=1)
    enriched = pd.concat([raw, likelihoods], axis=1)

    enriched = retrieve_m49_standard(enriched)

    enriched.to_csv("../output_gcg/dataset/complete-data.csv", index=False)

In [14]:
add_safe_search()

In [15]:
df_gcg = pd.read_csv("../output_gcg/dataset/complete-data.csv")

## Adding keyword check

Let's do the same with the keyword search. Again, this code is taken from one of the previous scripts (this time, 7.keyword-lookup.ipynb) with slight adaptations. 

In [16]:
# Terms that are looked up in the titles of the search results.
# "beauty" and "beautiful" are commented out, so they are not part of the lookup.
keywords = [
    "sexy", "hot", "hottest",
    #"beauty", "beautiful",
    "sex", "laid", "fuck",
    "marry", "marriage", "bride", "brides", "wife", "wives", "mail", "order", 
    "dating", "date", "meet", "single",
]

In [17]:
# Adds word boundaries:
keywords = [r"\b" + word + r"\b" for word in keywords]

In [18]:
capture = re.compile("|".join(keywords))

In [19]:
# Fill the nans with an empty string
df_gcg["full_title"] = df_gcg.full_title.fillna("")


In [20]:
# Flags the titles in which at least one of the keywords is found
df_gcg["keywords_bool"] = df_gcg.full_title.str.lower().apply(lambda title: capture.search(title) is not None)

In [21]:
df_gcg.to_csv("../output_gcg/dataset/complete-data-classified.csv", index=False)

## Comparing the two datasets

First, let's take a look at the two datasets and check in which ways they differ.

In [22]:
df_gcg = pd.read_csv("../output_gcg/dataset/complete-data-classified.csv")
df_rm = pd.read_csv("../output/dataset/3.complete-data-classified.csv")

How many entries does Gianna's dataset have?

In [23]:
df_gcg.shape[0]

20570

What about mine?

In [24]:
df_rm.shape[0]

20561

Her dataframe has 9 more entries. Why is that?

In [25]:
df_gcg.search_query.value_counts()

congolese women             200
dominican women             200
swiss women                 100
vietnamese women            100
montenegrin women           100
israeli women               100
grenadian women             100
nicaraguan women            100
belgian women               100
moldovan women              100
finnish women               100
ecuadorian women            100
kuwaiti women               100
cypriot women               100
solomon island women        100
beninese women              100
puerto rican women          100
samoan women                100
papua new guinean women     100
gambian women               100
bhutanese women             100
bulgarian women             100
myanma women                100
comorian women              100
tuvaluan women              100
thai women                  100
estonian women              100
portuguese women            100
ethiopian women             100
kyrgyzstani women           100
swedish women               100
polish w

In [26]:
df_rm.search_query.value_counts()

congolese women             200
dominican women             200
swiss women                 100
vietnamese women            100
montenegrin women           100
israeli women               100
grenadian women             100
nicaraguan women            100
belgian women               100
moldovan women              100
finnish women               100
ecuadorian women            100
kuwaiti women               100
cypriot women               100
solomon island women        100
beninese women              100
puerto rican women          100
samoan women                100
papua new guinean women     100
gambian women               100
bhutanese women             100
bulgarian women             100
myanma women                100
comorian women              100
tuvaluan women              100
thai women                  100
estonian women              100
portuguese women            100
ethiopian women             100
kyrgyzstani women           100
swedish women               100
polish w

Notice how the difference comes from a few search queries that returned fewer or more results for these countries: Macau, Botswana, São Tomé and Liechtenstein. 

Let's take a look at the unique images, titles and links.

In [27]:
giannas_imgs = pd.Series(df_gcg.original.unique())
rodrigos_imgs = pd.Series(df_rm.original.unique())

In [28]:
giannas_titles = pd.Series(df_gcg.full_title.unique())
rodrigos_titles = pd.Series(df_rm.full_title.unique())

In [29]:
giannas_links = pd.Series(df_gcg.link.unique())
rodrigos_links = pd.Series(df_rm.link.unique())

#### First, the images

How many unique images from Gianna's dataset are also on mine?

In [30]:
giannas_imgs.isin(rodrigos_imgs).value_counts()

True     15370
False     4880
dtype: int64

In [31]:
giannas_imgs.isin(rodrigos_imgs).value_counts(normalize=True)

True     0.759012
False    0.240988
dtype: float64

How many unique images from my dataset are also in Gianna's?

In [32]:
rodrigos_imgs.isin(giannas_imgs).value_counts()

True     15370
False     4852
dtype: int64

In [33]:
rodrigos_imgs.isin(giannas_imgs).value_counts(normalize=True)

True     0.760063
False    0.239937
dtype: float64

Notice that the number of images that are in both datasets is the same (15370) in both directions, as expected. The difference between the `False` counts is not 9, however. This happens because 9 is the difference in *entries*, not in *unique items*. The difference in unique images is 28 (4880 − 4852), as expected. 

**What about the titles?**

Let's check how many of Gianna's titles are also in my dataset.

In [34]:
giannas_titles.isin(rodrigos_titles).value_counts()

True     13134
False     4239
dtype: int64

In [35]:
giannas_titles.isin(rodrigos_titles).value_counts(normalize=True)

True     0.756001
False    0.243999
dtype: float64

And now, how many of my titles are also in Gianna's dataset?

In [36]:
rodrigos_titles.isin(giannas_titles).value_counts()

True     13134
False     4549
dtype: int64

In [37]:
rodrigos_titles.isin(giannas_titles).value_counts(normalize=True)

True     0.742747
False    0.257253
dtype: float64

#### And, finally, the links

Gianna's links that are also on my dataset:

In [38]:
giannas_links.isin(rodrigos_links).value_counts()

True     14332
False     4367
dtype: int64

My links that are also on Gianna's dataset:

In [39]:
rodrigos_links.isin(giannas_links).value_counts(normalize=True)

True     0.767073
False    0.232927
dtype: float64

Overall, there is an overlap of around 75%.

Let's take a look at the amount of content with racy images and objectifying keywords.

#### First, the racy images

In [40]:
df_gcg.racy.isin(["VERY_LIKELY", "LIKELY"]).value_counts(normalize=True)

False    0.900972
True     0.099028
Name: racy, dtype: float64

In [41]:
df_rm.racy.isin(["VERY_LIKELY", "LIKELY"]).value_counts(normalize=True)

False    0.899956
True     0.100044
Name: racy, dtype: float64

Is there a difference in specific countries, though?

In [42]:
def get_percentages(df, eval_col, condition, group_col=None, join_cols=None, normalize=True, zeroes=True):
    '''
    Gets the share (or the absolute number) of rows in a dataframe
    whose value in a given column meets a condition, with
    additional support for groupbys.
    
    Parameters:
    
    df -> The pandas dataframe to be aggregated
    eval_col -> The column whose values will be evaluated
    condition -> A boolean function that will be applied to each value of eval_col
    group_col -> The column by which the dataframe is grouped. Optional.
    join_cols -> List of other columns of df to attach to the result, taken from the first row of each group. Only used together with group_col. The list is not modified.
    normalize -> Pass 'False' to have an absolute count instead of a proportion
    zeroes -> Whether or not the result should include, with value 0, the groups in which no row meets the condition. Only used together with group_col.
    
    Returns a dataframe with a column named "percentage_<eval_col>",
    preceded by group_col when grouping. Grouped results are sorted
    from the highest to the lowest value, with the zero rows at the end.
    '''

    value_col = f'percentage_{eval_col}'

    # With a group column, the group keys are carried in the index of the evaluated values
    if group_col:
        applied = df.set_index(group_col)[eval_col].apply(condition)
    else:
        applied = df[eval_col].apply(condition)

    # Only results equal to True count as matches;
    # missing results are left out of the total used for the proportion
    matches = applied.eq(True).astype(int)
    valid = applied.notna().astype(int)

    if group_col:

        counts = matches.groupby(level=0).sum()

        if normalize:
            values = counts / valid.groupby(level=0).sum()
        else:
            values = counts

        # Groups without any match are dropped here and,
        # if requested, added back as zeroes further below
        percentages = values[counts > 0].rename(value_col).reset_index()
        percentages = percentages.sort_values(by=value_col, ascending=False)

    else:

        count = int(matches.sum())

        if count == 0:
            values = []
        elif normalize:
            values = [count / int(valid.sum())]
        else:
            values = [count]

        percentages = pd.DataFrame({value_col: values})

    # Zero rows only make sense per group, so this is skipped when there is no group column
    if zeroes and group_col:

        unmatched = df.loc[~df[group_col].isin(percentages[group_col]), group_col].unique()

        if len(unmatched) > 0:
            zero_rows = pd.DataFrame({group_col: unmatched, value_col: 0})
            percentages = pd.concat([percentages, zero_rows])

    if join_cols and group_col:

        # Builds a new list so that the one passed by the caller is left untouched
        info_cols = [col for col in join_cols if col != group_col] + [group_col]

        info = df.drop_duplicates(subset=group_col)[info_cols]

        percentages = percentages.merge(info, on=group_col, how='left')

    return percentages

Let's look at the keywords first.

In [43]:
results_gcg = get_percentages(df_gcg, 
                eval_col='keywords_bool', 
                condition=lambda x: x is True, # This is simply checking if the boolean is true. We can pass any other condition, though
                group_col='search_query',
                join_cols=[])

In [44]:
results_rm = get_percentages(df_rm, 
                eval_col='keywords_bool', 
                condition=lambda x: x is True, # This is simply checking if the boolean is true. We can pass any other condition, though
                group_col='search_query',
                join_cols=[])

In [45]:
results = results_gcg.merge(results_rm, on='search_query', suffixes=["_gcg", "_rm"])

In [46]:
results["diff"] = (results.percentage_keywords_bool_gcg - results.percentage_keywords_bool_rm).abs()

In [47]:
results.sort_values(by='diff', ascending=False)

Unnamed: 0,search_query,percentage_keywords_bool_gcg,percentage_keywords_bool_rm,diff
23,uruguayan women,0.27,0.39,0.12
139,andorran women,0.06,0.17,0.11
4,latvian women,0.43,0.52,0.09
6,filipino women,0.4,0.32,0.08
50,kyrgyzstani women,0.19,0.26,0.07
2,czech women,0.49,0.55,0.06
62,japanese women,0.16,0.22,0.06
28,serbian women,0.26,0.32,0.06
5,colombian women,0.41,0.46,0.05
40,armenian women,0.21,0.26,0.05


Notice how almost every country had a difference of under 5 percentage points.