# 8. Manual check

We need to be sure about the images and titles that we tagged as positive in the previous automatic proccesses. In order to do so, we need to manually verify those entries. This will be done through manual review in this notebook.

In [1]:
import pandas as pd
import json
import glob
import re
from IPython.display import Image, clear_output, Markdown

In [2]:
df = pd.read_csv("../output/dataset/3.complete-data-classified.csv")

We will check the pairs manually in a loop and save the results to JSON files.

In [3]:
def check_positive_imgs(df):
    '''
    Prints all images marked as racy in the dataframe
    and asks if the user agrees with the classification.
    Saves the answer in a JSON file.
    
    This function is used mainly to take a look at all
    pictures and see if there are visible systematic
    biases in Google's algorithm that could render
    the whole analysis useless.
    
    I do understand that this procedure is very subjective,
    that it's very hard to define what is racy and what is not
    and that this sort of manual classification is permeated by sexism.
    
    I hope, however, to  get a better grasp of the dataset by doing so,
    which will aid in the reporting. 
    '''
    
    # Selects racy images
    to_check = df[df.racy_bool]
    
    # Filter out images that have already been checked
    
    #### Create the path in which the file should be salved
    to_check["manual_check_path"] = ("../output/positives_manual_check/imgs/" + 
                                     to_check.search_query.str.replace(" ","-") + 
                                     "-" + to_check.position.astype(str) +
                                     ".json")
    
    #### Check if it's already there
    checked_files = glob.glob("../output/positives_manual_check/imgs/*.json")
    to_check = to_check[~to_check.manual_check_path.isin(checked_files)]
        
    for index, row in to_check.iterrows():
        
        # Store evaluation in a dictionary
        data = {}
        
        try:
            # Display image and ask for evaluation
            display(f"{row.search_query} {row.position}")
            display(Image(row.img_path, width=600))
        except ValueError:
            display("Display error")
            clear_output()
            continue
        
        answer = ""
        while answer not in ("y", "n"):
            answer = input("Do you agree that this image is racy? (y/n)")
        
        data["manual_check"] = True if answer == "y" else False
            
        clear_output()

        # Save both as JSON
        with open(row.manual_check_path, "w+") as f:
            json.dump(data, f, indent=4)
        
        fp = f"{row.search_query.replace(' ', '-')}-{row.position}.json"

In [4]:
check_positive_imgs(df)

We will also check a samall sample of negative images to see if we can find similar misclassification patterns in the items not marked as racy.

In [5]:
def sample_img_check(df, frac, subset):
    '''
    Randomly selects a percentage of rows
    from the dataframe, displays their
    images and asks the user for input
    regarding the accuracy of CloudVision
    SafeSearch results.
    
    As before, I do understand that this procedure 
    is very subjective and that it's very hard to define 
    what is racy and what is not. 
    
    Since the subset to check is smaller this time,
    we will save it directly to a CSV file.
    
    Parameters:
    
    df -> The dataframe to be checked
    frac -> The percentage of rows to be verified
    subset -> The labels that we will want to consider. Can be 'positive' or 'negative'
    '''
        
    # Select only rows that have real, downloaded images
    df = df[~(df.racy.isna())]
    df = df[~(df.img_path.str.contains("charset"))]
    
    if subset == 'positive':
        df = df[df.racy_bool]
    elif subset == 'negative':
        df = df[~df.racy_bool]
    else:
        raise ValueError

    # Ensures that the output path exists
    outpath = f"../output/negatives_manual_check/imgs/dataset/sample-check-{subset}.csv"
    with open(outpath, "w+") as f:
        pass
        
    # Keep only rows of interest to optimize behavior
    df = df[["img_path", "racy"]]

    random_state = 4121 # A fixed seed to ensure reproducibility
    sample = df.sample(frac=frac, random_state=random_state)
    
    # Placeholder for veredict
    sample["classification"] = None
    
    for index, row in sample.iterrows():
        
        if row.racy in ["LIKELY", "VERY_LIKELY"]:
            classification = "POSITIVE"
        elif row.racy in ["POSSIBLE", "UNLIKELY", "VERY_UNLIKELY"]:
            classification = "NEGATIVE"
        
        try:
            display(Image(row.img_path, width=300, height=300))        
            print(f"According to our criteria regardind Cloud Vision SafeSearch results, this image is a {classification} for racy")
        except ValueError:
            sample.loc[index, 'classification'] = 'display_error'
            continue
            
        
        answer = ""
        while answer not in ["t", "f"]:
            answer = input("Do you agree? Type 't', 'f'\n")
            answer = answer.lower()
        
        csv_row = f"{row.img_path},{row.racy},{answer}\n"
        with open(outpath, 'a') as fd:
            fd.write(csv_row)

        clear_output()
        
            
    return

In [6]:
#sample_img_check(df, .02, subset='negative')

And we will check all the titles that have the keywords we highlighted to see if they are actually refering to a sexualizing context. I'll also check the titles of the pictures marked as racy to see if there are websites that I'm missing, since they don't use the same exact keywords.

In [7]:
def check_titles(df):
    
    '''
    Prints each row's title and url, asking the 
    user if the content display refers to marriage,
    dating, sexual tourism or otherwise sexualized
    content. 
    '''
    
    # Selects both racy images and images with racy keywords
    to_check = df[(df.keywords_bool) | (df.racy_bool)]
    
    # Filter out images that have already been checked
    
    #### Create the path in which the file should be salved
    to_check["manual_check_path"] = ("../output/positives_manual_check/keywords/" + 
                                     to_check.search_query.str.replace(" ","-") + 
                                     "-" + to_check.position.astype(str) +
                                     ".json")
    
    #### Check if it's already there
    checked_files = glob.glob("../output/positives_manual_check/keywords/*.json")
    to_check = to_check[~to_check.manual_check_path.isin(checked_files)]
    
    total = to_check.shape[0]
        
    for index, row in to_check.reset_index().iterrows():
        
        display(f"{total} remaining pictures")
    
        # Store evaluation in a dictionary
        data = {}
        
        # Display image and ask for evaluation
        display(f"{row.search_query} {row.position}")
        display(Markdown(f"## {row.full_title}"))
        display(Markdown(f"**{row.source}**"))
        display(Markdown(f"**{row.link}**"))
        
        answer = ""
        while answer not in ("y", "n"):
            answer = input("Do you agree that this website is about dating, marriage, hook-up tourism or otherwise objectifying/sexualizing? (y/n)")
        
        data["manual_check"] = True if answer == "y" else False
        data["search_query"] = row.search_query.replace(" ",  "-")
        data["position"] = row.position
        
        total -= 1
        
        clear_output()

        # Save both as JSON
        with open(row.manual_check_path, "w+") as f:
            json.dump(data, f, indent=4)

In [8]:
check_titles(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
