# 10. Validating

This script validates the results of our first data collection by comparing them to data collections done by other people at other points in time.

In [1]:
import pandas as pd
import json
import glob
import os
import numpy as np
import re

In [2]:
# Raises pandas' row display limit to 500 so long outputs are not truncated
pd.set_option('display.max_rows', 500)

In [13]:
def read_jsons(directory):
    '''
    Reads the JSON files with the Google Image Search results
    and stores them into a single dictionary, which is returned.

    Parameters:
        directory: path of the folder holding the `*.json` files.
            Only the folder itself is scanned, not its subfolders.

    Returns:
        A dictionary mapping the search query of each file
        (`search_parameters` -> `q`) to its list of image results
        (`images_results`). If several files share the same query,
        the file whose path sorts last wins.

    Raises:
        KeyError: if a file lacks `search_parameters`, `q`
            or `images_results`.
    '''

    # Grabs all the filepaths. They are sorted because glob returns them
    # in arbitrary order, which would make the outcome for duplicated
    # queries depend on the filesystem.
    files = sorted(glob.glob(os.path.join(directory, "*.json")))

    # Reads all those files and saves them into a new dictionary
    jsons = {}
    for file in files:

        # Reads as UTF-8 explicitly instead of relying on the
        # platform's default encoding
        with open(file, encoding="utf-8") as f:
            data = json.load(f)

        query = data["search_parameters"]['q']
        jsons[query] = data["images_results"]

    return jsons

In [14]:
def make_df(jsons):
    '''
    Builds one pandas dataframe out of the dictionary
    produced by read_jsons() and returns it.

    Each key of `jsons` is a search query and each value is the list
    of results for that query. Every result becomes a row, tagged with
    its query in the `search_query` column, and the rows of all queries
    are stacked under a fresh index starting at zero.
    '''

    # One dataframe per query, each carrying an identifying
    # column with the search query it came from
    frames = [
        pd.DataFrame(results).assign(search_query=query)
        for query, results in jsons.items()
    ]

    return pd.concat(frames, ignore_index=True)

#### Reads my data in

In [17]:
# Loads our own search results (the first data collection) into a dataframe
rm_df = make_df(read_jsons("../output/search_results/"))

## Comparisons

In [28]:
def calculate_overlap(df_a, df_b):
    '''
    Prints how much two result dataframes overlap.

    For the `original` column (images) and the `link` column (URLs),
    takes the unique values of each dataframe and displays the share
    of them that also appear in the other dataframe, first from
    df_a to df_b and then from df_b to df_a. Nothing is returned.
    '''

    # Unique values of both dataframes, per compared column
    uniques = {
        column: (
            pd.Series(df_a[column].unique()),
            pd.Series(df_b[column].unique()),
        )
        for column in ("original", "link")
    }
    imgs_a, imgs_b = uniques["original"]
    urls_a, urls_b = uniques["link"]

    # Each entry: the question to print, the values being looked up,
    # and the values they are looked up in
    comparisons = [
        ("How many of the images in df_a are also in df_b?", imgs_a, imgs_b),
        ("How many of the URLs in df_a are also in df_b?", urls_a, urls_b),
        ("How many of the images in df_b are also in df_a?", imgs_b, imgs_a),
        ("How many of the URLs in df_b are also in df_a?", urls_b, urls_a),
    ]

    for question, subset, reference in comparisons:
        print(question)

        # Proportion of True (found) versus False (not found)
        shares = subset.isin(reference).value_counts(normalize=True)
        display(shares)

### 1. Gianna Grün

In [35]:
# Loads the search results of the first validation run into a dataframe
gg_df = make_df(read_jsons("../validation_output/1/search_results/"))

In [36]:
# Compares our collection with the first validation run, in both directions
calculate_overlap(rm_df, gg_df)

How many of the images in df_a are also in df_b?


True     0.760063
False    0.239937
dtype: float64

How many of the URLs in df_a are also in df_b?


True     0.767073
False    0.232927
dtype: float64

How many of the images in df_b are also in df_a?


True     0.759012
False    0.240988
dtype: float64

How many of the URLs in df_b are also in df_a?


True     0.766458
False    0.233542
dtype: float64

### 2. Thanasis Troboukis

In [31]:
# Loads the search results of the second validation run into a dataframe
tt_df = make_df(read_jsons("../validation_output/2/search_results/"))

In [32]:
# Compares our collection with the second validation run, in both directions
calculate_overlap(rm_df, tt_df)

How many of the images in df_a are also in df_b?


True     0.66393
False    0.33607
dtype: float64

How many of the URLs in df_a are also in df_b?


True     0.681653
False    0.318347
dtype: float64

How many of the images in df_b are also in df_a?


True     0.663209
False    0.336791
dtype: float64

How many of the URLs in df_b are also in df_a?


True     0.676583
False    0.323417
dtype: float64

### 3. Vinicius Sueiro

In [33]:
# Loads the search results of the third validation run into a dataframe
vs_df = make_df(read_jsons("../validation_output/3/search_results/"))

In [34]:
# Compares our collection with the third validation run, in both directions
calculate_overlap(rm_df, vs_df)

How many of the images in df_a are also in df_b?


True     0.665908
False    0.334092
dtype: float64

How many of the URLs in df_a are also in df_b?


True     0.682884
False    0.317116
dtype: float64

How many of the images in df_b are also in df_a?


True     0.665316
False    0.334684
dtype: float64

How many of the URLs in df_b are also in df_a?


True     0.677337
False    0.322663
dtype: float64

In [37]:
# Compares the second and third validation runs with each other
calculate_overlap(tt_df, vs_df)

How many of the images in df_a are also in df_b?


True     0.936524
False    0.063476
dtype: float64

How many of the URLs in df_a are also in df_b?


True     0.942626
False    0.057374
dtype: float64

How many of the images in df_b are also in df_a?


True     0.936709
False    0.063291
dtype: float64

How many of the URLs in df_b are also in df_a?


True     0.941976
False    0.058024
dtype: float64