# 9. Data analysis

After manually reviewing all entries marked as positive and a representative sample of the entries marked as negative, I feel confident enough to start doing some proper data analysis.

#### Importing packages

In [1]:
import pandas as pd
import json
import glob
import geopandas as gpd
import json
import os
%matplotlib inline

#### Data

In [2]:
# Load the classified dataset that every cell below analyses
df = pd.read_csv("../output/dataset/complete-data-classified.csv")

#### Configs

In [3]:
# Let pandas print up to 500 rows before truncating the output
pd.options.display.max_rows = 500

## How many results do we have?

In [4]:
# Number of rows in the dataset
len(df)

1000

## Checking the collection success rate

How many valid search results do we have for each country?

Notice that Dominica and the Dominican Republic have the same results, since they were produced by the same search query ("dominican women").

In [5]:
# Non-null search results per search query
df.groupby("search_query")["search_query"].count()

search_query
deutsche frauen         100
femmes françaises       100
mujeres colombianas     100
mujeres dominicanas     100
mulheres brasileiras    100
pinay                   100
české ženy              100
жінки з україни         100
русские женщины         100
ผู้หญิงไทย              100
Name: search_query, dtype: int64

How many images were successfully downloaded for each nation?

In [6]:
# Rows with a non-null image path, per search query, largest first
(df.groupby("search_query")["img_path"]
   .count()
   .sort_values(ascending=False))

search_query
ผู้หญิงไทย              100
русские женщины         100
жінки з україни         100
české ženy              100
pinay                   100
mulheres brasileiras    100
mujeres dominicanas     100
mujeres colombianas     100
femmes françaises       100
deutsche frauen          98
Name: img_path, dtype: int64

And how many title tags were successfully retrieved?

In [7]:
# Rows with a non-null title, per search query, largest first
(df.groupby("search_query")["full_title"]
   .count()
   .sort_values(ascending=False))

search_query
русские женщины         100
жінки з україни         100
české ženy              100
pinay                   100
mulheres brasileiras    100
mujeres dominicanas     100
mujeres colombianas     100
femmes françaises       100
ผู้หญิงไทย               99
deutsche frauen          98
Name: full_title, dtype: int64

How many entries have both images and titles successfully extracted, by country?

In [8]:
# Keep only rows where both the image path and the title are present,
# then count them per search query, largest first
(df.dropna(subset=["img_path", "full_title"])
   .groupby("search_query")["img_path"]
   .count()
   .sort_values(ascending=False))

search_query
русские женщины         100
жінки з україни         100
české ženy              100
pinay                   100
mulheres brasileiras    100
mujeres dominicanas     100
mujeres colombianas     100
femmes françaises       100
ผู้หญิงไทย               99
deutsche frauen          97
Name: img_path, dtype: int64

## Quantifying racy imagery

First, let's determine how many images we have in total and how many were tagged as racy.

In [9]:
def quantify(df, eval_col, conditions, descriptions):
    '''
    Prints how many rows of the dataframe meet each
    specified condition, each count followed by its
    human-readable description.

    The dataframe is only read: no helper column is
    written into it.

    Parameters:

    df -> A pandas dataframe
    eval_col -> The column whose values will be evaluated
    conditions -> A list of functions, each applied to every value of that column and returning either True or False
    descriptions -> A list of human-language descriptions, one per condition, printed right after the matching count

    Raises:

    AssertionError -> If conditions and descriptions don't have the same length
    '''

    assert len(conditions) == len(descriptions)

    values = df[eval_col]

    for condition, description in zip(conditions, descriptions):

        # Count on a throwaway mask rather than storing it as a column of
        # the caller's dataframe, so that calling this has no side effects
        matches = values.apply(condition).astype(bool)
        count = int(matches.sum())

        print(f"{count} {description}")

In [10]:
quantify(
    df=df,
    eval_col="racy_bool",
    # Identity checks against the two boolean values found in the column
    conditions=[lambda value: value is True, lambda value: value is False],
    descriptions=["rows have racy images", "rows don't have racy images"],
)

132 rows have racy images
868 rows don't have racy images


That said, let's take a look at which countries have the highest percentage of images tagged as racy.

In [11]:
def get_percentages(df, eval_col, condition, group_col=None, join_cols=None, normalize=True, zeroes=True):
    '''
    Gets the share (or the absolute amount) of rows
    in a dataframe that meet a given condition, with
    additional support for groupbys.

    Parameters:

    df -> The pandas dataframe to be aggregated. It is not modified.
    eval_col -> The column whose values will be evaluated
    condition -> A boolean function that will be applied to each value of eval_col
    group_col -> The column by which the dataframe should be grouped. Optional.
    join_cols -> A list of other informative columns of df to attach to each group. Optional, only used together with group_col. The list is not modified.
    normalize -> Pass 'False' to have an absolute count instead of a fraction
    zeroes -> Whether or not the result should include entries for which no row met the condition, with a value of 0

    Returns:

    A dataframe with a 'percentage_<eval_col>' column. With group_col there
    is one row per group, sorted from the highest to the lowest value (ties
    keep the sorted group order), followed by the zero-valued groups in order
    of first appearance when zeroes is True. Without group_col the dataframe
    has a single row (or none, if nothing matched and zeroes is False).
    '''

    value_col = f'percentage_{eval_col}'

    if group_col:

        # 1 where the condition holds and 0 otherwise, indexed by group.
        # Mean/sum of these flags gives the share/count directly, without
        # relying on how value_counts names its result, which differs
        # between pandas versions.
        flags = (df.set_index(group_col)
                 [eval_col]
                 .apply(condition)
                 .astype(bool)
                 .astype(int))

        per_group = flags.groupby(level=0)
        values = per_group.mean() if normalize else per_group.sum()

        # A stable sort keeps tied groups in a deterministic order
        percentages = (values[values > 0]
                       .rename(value_col)
                       .reset_index()
                       .sort_values(by=value_col, ascending=False, kind='mergesort')
                       .reset_index(drop=True))

        if zeroes:

            # Groups present in the data but without a single matching row
            seen = df[group_col].drop_duplicates()
            missing = seen[~seen.isin(percentages[group_col])]

            if not missing.empty:

                zero_rows = pd.DataFrame({group_col: missing.to_numpy(), value_col: 0})

                if percentages.empty:
                    percentages = zero_rows
                else:
                    percentages = pd.concat([percentages, zero_rows], ignore_index=True)

        if join_cols:

            # Build a new list instead of appending to the caller's one
            info_cols = [*join_cols, group_col]

            info = (df.drop_duplicates(subset=group_col)
                    [info_cols])

            percentages = percentages.merge(info, on=group_col, how='left')

    else:

        # Without groups there is nothing to join and no per-group zero
        # rows to add, so zeroes only decides whether a 0 result is kept
        flags = df[eval_col].apply(condition).astype(bool)

        hits = int(flags.sum())
        total = len(flags)

        if normalize:
            value = hits / total if total else 0.0
        else:
            value = hits

        rows = [value] if (hits or zeroes) else []
        percentages = pd.DataFrame({value_col: rows})

    return percentages

#### All countries

In [12]:
get_percentages(
    df,
    eval_col='racy_bool',
    # Plain check that the boolean is True; any other predicate could be passed instead
    condition=lambda flag: flag is True,
    group_col='country_or_area',
    join_cols=['iso_a3'],
)

Unnamed: 0,country_or_area,percentage_racy_bool,iso_a3
0,Colombia,0.27,COL
1,Philippines,0.27,PHL
2,Dominica,0.22,DMA
3,Czechia,0.12,CZE
4,Thailand,0.12,THA
5,France,0.1,FRA
6,Brazil,0.09,BRA
7,Russian Federation,0.07,RUS
8,Germany,0.06,DEU
9,Ukraine,0.0,UKR


# Comparison with regular English searches

I want to compare the results in English with the results in local languages.

In [15]:
# Load the dataset the local-language results are compared against.
# NOTE(review): this path climbs two directories ("../../output"), while every
# other file in this notebook lives under "../output" — confirm this is the
# intended location of the English-language results.
eng_results = pd.read_csv("../../output/dataset/complete-data-manually-checked.csv")

In [16]:
# Share of racy images per country for the local-language searches
safesearch_local = get_percentages(
    df,
    eval_col='racy_bool',
    condition=lambda flag: flag is True,  # plain check that the boolean is True
    group_col='country_or_area',
    join_cols=['iso_a3'],
)

In [17]:
# Share of racy images per country for the English searches
safesearch_eng = get_percentages(
    eng_results,
    eval_col='racy_bool',
    condition=lambda flag: flag is True,  # plain check that the boolean is True
    group_col='country_or_area',
    join_cols=['iso_a3'],
)

In [18]:
# Default (inner) merge: only countries present in both tables are kept.
# The two percentage columns get the "_local" / "_eng" suffixes.
merged = safesearch_local.merge(
    safesearch_eng,
    on=["country_or_area", "iso_a3"],
    suffixes=("_local", "_eng"),
)

In [19]:
# Render the side-by-side comparison table
display(merged)

Unnamed: 0,country_or_area,percentage_racy_bool_local,iso_a3,percentage_racy_bool_eng
0,Colombia,0.27,COL,0.34
1,Philippines,0.27,PHL,0.19
2,Dominica,0.22,DMA,0.41
3,Czechia,0.12,CZE,0.18
4,Thailand,0.12,THA,0.16
5,France,0.1,FRA,0.08
6,Brazil,0.09,BRA,0.41
7,Russian Federation,0.07,RUS,0.11
8,Germany,0.06,DEU,0.05
9,Ukraine,0.0,UKR,0.17


In [20]:
# Save the comparison table; index=False leaves the row index out of the CSV
merged.to_csv("../output/dataset/comparison-local-eng.csv", index=False)