# Calculate unknown unknowns
---

In [None]:
project_path = "/home/jupyter"
import os
import sys
sys.path.append(project_path)

from google.cloud import bigquery
from google.cloud import storage
import geopandas as gpd
from plotly import graph_objs as go
import seaborn as sns
import time
import itertools

import numpy as np
import pandas as pd
import plotly.express as px
import cmath as math

from fintrans_toolbox.src import bq_utils as bq

In [None]:
# BigQuery client; read_in_data passes it to bq.read_bq_table_sql.
client = bigquery.Client()

# Specify filters
Here you can specify which dataset you wish to produce.

In [None]:
# Backtick-quoted table name; interpolated into the FROM clause by
# create_sql_query.
table = '`ons-fintrans-data-prod.fintrans_visa.retail_performance_high_streets_towns-2024-01-16T17_11_35`'
# Must be 'Month' or 'Quarter' (validated by create_sql_query).
time_period = 'Quarter'
# Location level used for the non-'All' side(s) of each query below; must be
# 'POSTAL_DISTRICT', 'POSTAL_AREA' or 'All' (validated by create_sql_query).
location_level = 'POSTAL_AREA'

# Functions
---

In [None]:
def create_sql_query(
    table,
    time_period,
    cardholder_location_level,
    merchant_location_level,
):
    '''Build the SQL query that sums spend for one combination of time
    period, cardholder location level and merchant location level.

    The query is restricted to rows where mcg = 'All' and is grouped by
    time_period_value, cardholder_location and merchant_location.

    Args:
        table (str): table name placed verbatim in the FROM clause.
        time_period (str): 'Month' or 'Quarter'
        cardholder_location_level (str): 'POSTAL_DISTRICT', 'POSTAL_AREA'
            or 'All'
        merchant_location_level (str): 'POSTAL_DISTRICT', 'POSTAL_AREA'
            or 'All'

    Returns:
        str: the SQL query text.

    Raises:
        ValueError: if time_period or either location level is not one of
            the accepted values. Arguments are checked in signature order.
    '''
    allowed_periods = ['Month', 'Quarter']
    if time_period not in allowed_periods:
        raise ValueError(
            f'Argument time_period must be one of {allowed_periods}'
        )

    allowed_levels = ['POSTAL_DISTRICT', 'POSTAL_AREA', 'All']
    level_arguments = (
        ('cardholder_location_level', cardholder_location_level),
        ('merchant_location_level', merchant_location_level),
    )
    for argument_name, level in level_arguments:
        if level not in allowed_levels:
            raise ValueError(
                f'Argument {argument_name} must be one of {allowed_levels}'
            )

    return f'''SELECT DISTINCT time_period_value, 
           cardholder_location, 
           merchant_location, 
           sum(spend) as sum_spend 
           FROM {table} 
           WHERE time_period = '{time_period}' 
           AND cardholder_location_level = '{cardholder_location_level}' 
           AND merchant_location_level = '{merchant_location_level}' 
           AND mcg = 'All' 
           GROUP BY time_period_value, cardholder_location, merchant_location 
           ORDER BY time_period_value, cardholder_location'''

In [None]:
def read_in_data(sql):
    '''Run a query through bq.read_bq_table_sql and return the result with
    time_period_value stored as a pd.Categorical (to save memory).

    Uses the module-level BigQuery ``client``.

    Args:
        sql (str): SQL query selecting the desired data.

    Returns:
        pd.DataFrame
    '''
    frame = bq.read_bq_table_sql(client, sql)
    periods = pd.Categorical(frame['time_period_value'])
    frame['time_period_value'] = periods
    return frame

In [None]:
def remove_postal_areas(df):
    '''Relabel merchant locations that contain no digit as 'UNKNOWN' and
    re-aggregate the spend.

    A merchant_location other than 'UNKNOWN' or 'All' that has no digit in
    it is treated as a postal area mislabelled as a postal district. Its
    spend is folded into the 'UNKNOWN' merchant row for the same time period
    and cardholder location. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): needs columns time_period_value,
            cardholder_location, merchant_location and sum_spend.

    Returns:
        pd.DataFrame: one row per (time_period_value, cardholder_location,
            merchant_location) with the summed sum_spend.
    '''
    cleaned = df.copy()
    merchant = cleaned['merchant_location']

    # Count the digits in each merchant location; postal areas have none.
    digit_counts = merchant.apply(
        lambda location: sum(character.isdigit() for character in location)
    )
    is_reserved_label = (merchant == 'UNKNOWN') | (merchant == 'All')
    mislabelled = ~is_reserved_label & (digit_counts == 0)
    cleaned.loc[mislabelled, 'merchant_location'] = 'UNKNOWN'

    group_keys = [
        'time_period_value',
        'cardholder_location',
        'merchant_location',
    ]
    totals = cleaned.groupby(group_keys, observed=True)['sum_spend'].sum()
    return totals.reset_index()

In [None]:
def set_locations_to_categorical(df):
    '''Convert both location columns to pandas Categorical with one shared
    set of categories. This is to save memory.

    The shared categories are the cardholder categories followed by any
    merchant locations that never appear as a cardholder location. Appending
    them (instead of forcing the merchant column onto the cardholder
    categories alone) stops those merchant locations from silently becoming
    NaN, and leaves the codes of every cardholder category unchanged.

    Args:
        df (pd.DataFrame): needs columns cardholder_location and
            merchant_location. It is not modified.

    Returns:
        pd.DataFrame: copy of df whose two location columns are categorical
            and share identical categories (and therefore identical codes).
    '''
    df_map = df.copy()

    cardholder = df_map['cardholder_location'].astype('category')
    merchant = df_map['merchant_location'].astype('category')

    # Merchant locations with no matching cardholder category.
    merchant_only = merchant.cat.categories.difference(
        cardholder.cat.categories
    )
    shared_categories = cardholder.cat.categories.append(merchant_only)

    # Give both columns the same mapping.
    df_map['cardholder_location'] = cardholder.cat.set_categories(
        shared_categories
    )
    df_map['merchant_location'] = merchant.cat.set_categories(
        shared_categories
    )

    return df_map

In [None]:
def create_location_mapping_columns(df):
    '''Add the integer category codes of the two location columns as new
    columns.

    Args:
        df (pd.DataFrame): cardholder_location and merchant_location must
            already be categorical. It is not modified.

    Returns:
        pd.DataFrame: copy of df with cardholder_location_code and
            merchant_location_code added.
    '''
    return df.assign(
        cardholder_location_code=df['cardholder_location'].cat.codes,
        merchant_location_code=df['merchant_location'].cat.codes,
    )

In [None]:
def create_lookup_table(df):
    '''Build a lookup table from each cardholder_location category to its
    position in the category list.

    Args:
        df (pd.DataFrame): dataframe whose cardholder_location column is in
            pd.Categorical format.

    Returns:
        pd.DataFrame: columns postal_district (the category) and
            postal_district_code (0, 1, 2, ... in category order).
    '''
    districts = df['cardholder_location'].cat.categories
    district_codes = range(len(districts))

    return pd.DataFrame({
        'postal_district': districts,
        'postal_district_code': district_codes,
    })

In [None]:
def import_data(sql):
    '''Read in the data for a query and, for postal district queries, clean
    up mislabelled merchant locations.

    read_in_data stores time_period_value as a categorical. If the text
    'POSTAL_DISTRICT' appears anywhere in the SQL string, the result is also
    passed through remove_postal_areas. A message is printed saying which
    path was taken.

    Args:
        sql (str): sql query for the data you are reading in.

    Returns:
        pd.DataFrame
    '''
    frame = read_in_data(sql)

    if 'POSTAL_DISTRICT' not in sql:
        print('''remove_postal_areas function NOT used. You SHOULD see this if
        you are looking at postal areas or at the all_all breakdown.''')
        return frame

    frame = remove_postal_areas(frame)
    print('''remove_postal_areas used. YOU SHOULD NOT SEE THIS IF YOU 
        ARE LOOKING AT POSTAL AREAS!!!''')
    return frame

In [None]:
def calculate_unknowns_stage_1(cardholder_merchant, cardholder_all):
    '''Add the spend that is missing at merchant level for each cardholder
    location as rows with merchant_location 'UNKNOWN'.

    Compares the data where we have cardholder location and merchant
    location to the data where we have cardholder location but merchant
    location is set to 'All'. For each (time_period_value,
    cardholder_location) the difference between the 'All' total and the sum
    of the merchant-level rows is added as an 'UNKNOWN' merchant row.

    Args:
        cardholder_merchant (pd.DataFrame): columns time_period_value,
            cardholder_location, merchant_location, sum_spend.
        cardholder_all (pd.DataFrame): same columns; its merchant_location
            column is dropped before merging.

    Returns:
        pd.DataFrame: cardholder_merchant plus the calculated 'UNKNOWN'
            merchant rows, summed per (time_period_value,
            cardholder_location, merchant_location).
    '''
    
    # Known spend per time period and cardholder location.
    grouped_cardholder_merchant =  cardholder_merchant.groupby(
        ['time_period_value', 'cardholder_location'],
        observed=True
    )['sum_spend'].sum().reset_index()
    
    cardholder_all = cardholder_all.drop(
    columns='merchant_location'
    )
    
    # Outer merge keeps keys that appear in only one of the two dataframes;
    # the side that is missing gets NaN.
    unknown = cardholder_all.merge(
        grouped_cardholder_merchant, 
        on=('time_period_value', 'cardholder_location'), 
        how='outer', 
        suffixes=('_cardholder_all', '_cardholder_merchant')
    )
    
    # For the case below we have data in the district_district dataframe but
    # not the district_all dataframe (we have more data in the
    # district_district data). Here we swap the values of the two columns for
    # those rows, so the 'all' column holds the known spend and the
    # 'merchant' column holds NaN.
    # NOTE(review): the NaN is filled with 0 below, so for these rows
    # sum_spend equals the full known spend. That value is then added as an
    # 'UNKNOWN' merchant row on top of the original rows in the concat -
    # confirm this is the intended treatment.
    mask = unknown['sum_spend_cardholder_all'].isnull()
    unknown.loc[
        mask, 
        ['sum_spend_cardholder_all', 'sum_spend_cardholder_merchant']
    ] = unknown.loc[
        mask,
        ['sum_spend_cardholder_merchant', 'sum_spend_cardholder_all']
    ].values
    
    # After the swap we fill the remaining NaN values with 0, so 0 is
    # subtracted in the calculation below. This covers locations present in
    # the district_all dataframe but not the district_district dataframe
    # (0 known spend), as well as the swapped rows described above.
    unknown['sum_spend_cardholder_merchant'] = (
        unknown['sum_spend_cardholder_merchant'].fillna(0)
    )
    
    # Spend not accounted for at merchant level.
    unknown['sum_spend'] = (
        unknown['sum_spend_cardholder_all'] - 
        unknown['sum_spend_cardholder_merchant']
    )
    unknown['merchant_location'] = 'UNKNOWN'

    unknown = unknown[['time_period_value', 
                       'cardholder_location', 
                       'merchant_location',
                       'sum_spend']]

    combined_data = pd.concat([cardholder_merchant, unknown], axis=0, ignore_index=True)
    
    # Merge the calculated unknowns into any 'UNKNOWN' merchant rows that
    # already exist for the same time period and cardholder location.
    combined_data =  combined_data.groupby(
        ['time_period_value', 'cardholder_location', 'merchant_location'],
        observed=True
    )['sum_spend'].sum().reset_index()
    
    return combined_data

In [None]:
def calculate_unknowns_stage_2(stage_1, all_all):
    '''Add the spend still missing from stage 1 as rows where both locations
    are 'UNKNOWN'.

    For each time period the stage 1 total is subtracted from the all_all
    total (cardholder and merchant location both set to 'All'). The
    remainder is stored with cardholder_location and merchant_location set
    to 'UNKNOWN'. Only time periods present in both inputs get a remainder.

    Args:
        stage_1 (pd.DataFrame): output of calculate_unknowns_stage_1.
        all_all (pd.DataFrame): needs columns time_period_value and
            sum_spend.

    Returns:
        pd.DataFrame: stage_1 plus the remainder rows, summed per
            (time_period_value, cardholder_location, merchant_location).
    '''
    key_columns = [
        'time_period_value',
        'cardholder_location',
        'merchant_location',
    ]

    # Spend already accounted for in each time period.
    period_totals = stage_1.groupby('time_period_value', observed=True)
    known_per_period = period_totals['sum_spend'].sum().reset_index()

    residual = known_per_period.merge(
        all_all,
        on='time_period_value',
        suffixes=('_district_district', '_all_all'),
    )
    shortfall = (
        residual['sum_spend_all_all']
        - residual['sum_spend_district_district']
    )
    residual = residual.assign(
        sum_spend=shortfall,
        cardholder_location='UNKNOWN',
        merchant_location='UNKNOWN',
    )
    residual = residual[key_columns + ['sum_spend']]

    stacked = pd.concat([stage_1, residual], axis=0, ignore_index=True)

    # Fold the remainder into any existing UNKNOWN/UNKNOWN rows.
    regrouped = stacked.groupby(key_columns, observed=True)
    return regrouped['sum_spend'].sum().reset_index()

In [None]:
def calculate_unknowns_stage_3(stage_2, all_merchant):
    '''Add merchant locations that only appear in the all_merchant data.

    Compares the output of calculate unknowns stage 2 with the data where we
    have merchant location but cardholder location is set to 'All'. Merchant
    locations present in all_merchant but absent from stage_2 are added as
    rows with cardholder_location 'UNKNOWN'. For each time period that has
    an UNKNOWN/UNKNOWN row in stage_2, the total of those added rows is
    subtracted from that row.

    Args:
        stage_2 (pd.DataFrame): output of calculate_unknowns_stage_2.
        all_merchant (pd.DataFrame): columns time_period_value,
            cardholder_location, merchant_location, sum_spend.

    Returns:
        pd.DataFrame: stage_2 (with adjusted UNKNOWN/UNKNOWN rows) followed
            by the newly added unknown-cardholder rows.
    '''
    
    # Total stage 2 spend per time period and merchant location, labelled
    # with cardholder_location 'All' so it can be merged with all_merchant
    # on all three key columns.
    stage_2_time_grouped = stage_2.groupby(
        ['time_period_value', 'merchant_location'],
        observed=True
    )['sum_spend'].sum().reset_index()
    
    stage_2_time_grouped['cardholder_location'] = 'All'
    
    all_merchant_merged = all_merchant.merge(
        stage_2_time_grouped,
        on=['time_period_value', 'cardholder_location', 'merchant_location'],
        how='outer',
        suffixes=('_all_merchant', '_stage_2')
    )
    
    # Rows with no stage 2 value: these keys are not present in the grouped
    # stage 2 data.
    missing_rows = all_merchant_merged[
        all_merchant_merged['sum_spend_stage_2'].isnull()
    ]
    # Total missing spend per time period.
    missing_rows_time_grouped = missing_rows.groupby(
        'time_period_value',
        observed=True
    )['sum_spend_all_merchant'].sum().reset_index()
    
    # Existing rows where both locations are unknown.
    unknown_unknown = stage_2[
        (stage_2['cardholder_location'] == 'UNKNOWN')
        & (stage_2['merchant_location'] == 'UNKNOWN')
    ]

    # Default (inner) merge: only time periods that have both an
    # UNKNOWN/UNKNOWN row and missing spend are kept. The two spend columns
    # already have different names, so the suffixes are not applied.
    comparisson_missing_rows = unknown_unknown.merge(
        missing_rows_time_grouped,
        on=['time_period_value'],
        suffixes=('_stage_2', '_all_merchant')
    )
    
    # Take the missing spend out of the UNKNOWN/UNKNOWN total; it is added
    # back below as rows with a known merchant location.
    comparisson_missing_rows['sum_spend_stage_2'] = (
        comparisson_missing_rows['sum_spend']
        - comparisson_missing_rows['sum_spend_all_merchant']
    )
    
    unknown_unknown_updated = comparisson_missing_rows[
        ['time_period_value', 
         'cardholder_location', 
         'merchant_location', 
         'sum_spend_stage_2']
    ].copy()
    # Rename sum_spend_stage_2 back to sum_spend.
    unknown_unknown_updated.columns = (
        unknown_unknown_updated.columns.str.replace('_stage_2', '')
    )
    
    stage_2_unknown_updated = stage_2.merge(
        unknown_unknown_updated,
        on=['time_period_value',
            'cardholder_location',
            'merchant_location'],
        how='outer',
        suffixes=('_original', '_updated')
    )

    # Use the adjusted value where there is one, otherwise keep the original.
    stage_2_unknown_updated['sum_spend'] = (
        stage_2_unknown_updated['sum_spend_updated'].fillna(
            stage_2_unknown_updated['sum_spend_original']
        )
    )

    stage_2_unknown_updated.drop(
        columns=['sum_spend_original', 'sum_spend_updated'], inplace=True
    )
    
    # The missing merchant rows become rows with an unknown cardholder.
    new_unknown_merchant = missing_rows[
        ['time_period_value', 
         'cardholder_location', 
         'merchant_location', 
         'sum_spend_all_merchant']
    ].copy()
    
    new_unknown_merchant['cardholder_location'] = 'UNKNOWN'
    new_unknown_merchant = new_unknown_merchant.rename(
        columns={'sum_spend_all_merchant': 'sum_spend'}
    )

    complete_dataset = pd.concat(
        [stage_2_unknown_updated, new_unknown_merchant],
         axis=0,
         ignore_index=True
    )
    
    return complete_dataset

In [None]:
def calculate_unknowns(cardholder_district, cardholder_all, all_merchant, all_all):
    '''Run the three unknown-calculation stages in order and return the
    completed dataset.

    Args:
        cardholder_district (pd.DataFrame): data with both cardholder and
            merchant location at a location level.
        cardholder_all (pd.DataFrame): data with merchant location 'All'.
        all_merchant (pd.DataFrame): data with cardholder location 'All'.
        all_all (pd.DataFrame): data with both locations 'All'.

    Returns:
        pd.DataFrame: output of calculate_unknowns_stage_3.
    '''
    with_unknown_merchants = calculate_unknowns_stage_1(
        cardholder_district, cardholder_all
    )
    with_unknown_unknowns = calculate_unknowns_stage_2(
        with_unknown_merchants, all_all
    )
    return calculate_unknowns_stage_3(with_unknown_unknowns, all_merchant)

# Data
---

**The dataframe variable names use the following convention throughout:** 

* **cardholder_merchant** signifies we have cardholder location data at location level and merchant location data at a location level. 
* **cardholder_all** signifies we have cardholder location data at a location level but the merchant location is set to 'All'.
* **all_merchant** signifies we have set cardholder location to 'All' but the merchant location is at a location level.
* **all_all** signifies we have set cardholder and merchant locations to 'All'.

## SQL

In [None]:
# Cardholder and merchant location both at the chosen location level.
sql_cardholder_merchant = create_sql_query(
    table=table,
    time_period=time_period,
    cardholder_location_level=location_level,
    merchant_location_level=location_level,
)

In [None]:
# Cardholder location at the chosen level, merchant location 'All'.
sql_cardholder_all = create_sql_query(
    table=table,
    time_period=time_period,
    cardholder_location_level=location_level,
    merchant_location_level='All',
)

In [None]:
# Cardholder and merchant location both 'All'.
sql_all_all = create_sql_query(
    table=table,
    time_period=time_period,
    cardholder_location_level='All',
    merchant_location_level='All',
)

In [None]:
# Cardholder location 'All', merchant location at the chosen level.
sql_all_merchant = create_sql_query(
    table=table,
    time_period=time_period,
    cardholder_location_level='All',
    merchant_location_level=location_level,
)

## Create dataframes (with mislabelled merchant postal districts converted to unknowns)

In [None]:
# Run each query. import_data prints whether remove_postal_areas was
# applied, which depends on whether the SQL text contains 'POSTAL_DISTRICT'.
all_all = import_data(sql_all_all)
all_merchant = import_data(sql_all_merchant)
cardholder_all = import_data(sql_cardholder_all)
cardholder_merchant = import_data(sql_cardholder_merchant)

# Calculate unknown unknowns
---

In [None]:
# Fill in the unknown spend using all four breakdowns.
complete_dataset = calculate_unknowns(
    cardholder_district=cardholder_merchant,
    cardholder_all=cardholder_all,
    all_merchant=all_merchant,
    all_all=all_all,
)

In [None]:
# Display the completed dataset as the cell output.
complete_dataset

## CHECKS

In [None]:
# The completed dataset should add up to the all_all total. The difference
# is rounded to the nearest integer before comparing; sum_difference is
# reused by the export cell.
sum_complete = complete_dataset['sum_spend'].sum()
sum_all = all_all['sum_spend'].sum()
sum_difference = round(sum_all - sum_complete)
if sum_difference == 0:
    print(f'''PASSED.
    the sums of the complete dataset and all_all dataset match (to the nearest integer)''')
else:
    print(f'''FAILED. 
    There is a difference between the sum of the complete dataset and the original all_all data. 
    The difference is: {sum_difference}''')

## Export

In [None]:
# Output folder for the export; created if it does not exist yet.
path = '/home/jupyter/ft_articles/outputs/article_march_2024/'

if os.path.exists(path):
    print("Directory already exists!")
else:
    os.makedirs(path)
    print("Directory created successfully!")
    

In [None]:
# Only write the CSV when the sum check passed.
if sum_difference != 0:
    print('''NOT SAVED
    sum all_all != sum complete_dataset.
    INVESTIGATE THIS ISSUE AND MAKE SURE THE SUMS EQUAL EACH OTHER BEFORE SAVING!!!''')
else:
    output_file = path + f'DO_NOT_EXPORT_FROM_GCP_{location_level}_{time_period}_spend.csv'
    complete_dataset.to_csv(output_file, index=False)

# Unknowns Stats
---

In [None]:
def print_unknown_stats(cardholder_merchant, all_all, complete_dataset):
    '''Print how much spend has an unknown cardholder and/or merchant
    location, before and after the unknowns calculation.

    "original" figures come from cardholder_merchant and "complete" figures
    from complete_dataset. Percentages are relative to the all_all total,
    except the final COMPLETE_DATASET figure, which is relative to the
    complete_dataset total. "unknown unknowns" is the all_all total minus
    the cardholder_merchant total.

    The heading line reads the module-level ``location_level`` and
    ``time_period`` variables.

    Args:
        cardholder_merchant (pd.DataFrame): original data; needs columns
            cardholder_location, merchant_location and sum_spend.
        all_all (pd.DataFrame): data with both locations 'All'; needs
            column sum_spend.
        complete_dataset (pd.DataFrame): output of calculate_unknowns; same
            columns as cardholder_merchant.

    Returns:
        None. The statistics are written with print.
    '''
    def _unknown_spend(df, cardholder_unknown, merchant_unknown):
        '''Sum sum_spend over the rows whose two 'UNKNOWN' flags match.'''
        cardholder_flag = df['cardholder_location'] == 'UNKNOWN'
        merchant_flag = df['merchant_location'] == 'UNKNOWN'
        selected = (
            (cardholder_flag == cardholder_unknown)
            & (merchant_flag == merchant_unknown)
        )
        return df.loc[selected, 'sum_spend'].sum()

    # Denominator for every percentage except the last one.
    total_spend = all_all['sum_spend'].sum()

    def _percent(value):
        '''Express value as a percentage of the all_all total.'''
        return value / total_spend * 100

    cardholder_unknown_original = _unknown_spend(cardholder_merchant, True, False)
    cardholder_unknown_complete = _unknown_spend(complete_dataset, True, False)

    merchant_unknown_original = _unknown_spend(cardholder_merchant, False, True)
    merchant_unknown_complete = _unknown_spend(complete_dataset, False, True)

    both_unknown_original = _unknown_spend(cardholder_merchant, True, True)
    both_unknown_complete = _unknown_spend(complete_dataset, True, True)

    sum_all_unknowns_original = (
        cardholder_unknown_original
        + merchant_unknown_original
        + both_unknown_original
    )
    sum_all_unknowns_complete = (
        cardholder_unknown_complete
        + merchant_unknown_complete
        + both_unknown_complete
    )
    # Spend in the all_all total that is absent from the original data.
    sum_unknown_unknowns = total_spend - cardholder_merchant['sum_spend'].sum()

    perc_cardholder_unknown_original = _percent(cardholder_unknown_original)
    perc_cardholder_unknown_complete = _percent(cardholder_unknown_complete)
    perc_merchant_unknown_original = _percent(merchant_unknown_original)
    perc_merchant_unknown_complete = _percent(merchant_unknown_complete)
    perc_both_unknown_original = _percent(both_unknown_original)
    perc_both_unknown_complete = _percent(both_unknown_complete)

    perc_all_unknowns_original = (
        perc_cardholder_unknown_original
        + perc_merchant_unknown_original
        + perc_both_unknown_original
    )
    perc_all_unknowns_complete = (
        perc_cardholder_unknown_complete
        + perc_merchant_unknown_complete
        + perc_both_unknown_complete
    )

    perc_unknown_unknowns = _percent(sum_unknown_unknowns)
    perc_all_unknowns = _percent(
        sum_all_unknowns_original + sum_unknown_unknowns
    )

    # Any row with at least one unknown location, relative to the
    # complete_dataset total rather than the all_all total.
    complete_dataset_unknowns = complete_dataset[
        (complete_dataset['cardholder_location'] == 'UNKNOWN')
        | (complete_dataset['merchant_location'] == 'UNKNOWN')
    ]
    total_perc_unknown = (
        complete_dataset_unknowns['sum_spend'].sum()
        / complete_dataset['sum_spend'].sum() * 100
    )

    print(f'Unknown Stats at a {location_level} and {time_period}ly breakdown:\n')
    print('Percentage unknown stats:')
    print('    cardholder unknown:')
    print(f'        original: {perc_cardholder_unknown_original:.2f}%')
    print(f'        complete: {perc_cardholder_unknown_complete:.2f}%')
    print('    merchant unknown:')
    print(f'        original: {perc_merchant_unknown_original:.2f}%')
    print(f'        complete: {perc_merchant_unknown_complete:.2f}%')
    print('    both unknown:')
    print(f'        original: {perc_both_unknown_original:.2f}%')
    print(f'        complete: {perc_both_unknown_complete:.2f}%')
    print('    all unknowns:')
    print(f'        original: {perc_all_unknowns_original:.2f}%')
    # A single '%': f-strings do not treat '%%' as an escape, so the
    # doubled form would print two percent signs.
    print(f'        complete: {perc_all_unknowns_complete:.2f}%')
    print('')
    print(f'    unknown unknowns: {perc_unknown_unknowns:.2f}%')
    print('')
    print('''Total percentage unknown. This is any row which contains an unknown, 
    plus unknown unknowns, calculated using ORIGINAL DATA:''')
    print(f'        {perc_all_unknowns:.2f}%')
    print('')
    print('''Total percentage unknown. This is any row which contains an unknown, 
    plus unknown unknowns, calculated using COMPLETE_DATASET:''')
    print(f'        {total_perc_unknown:.2f}%')
    print('')
    print('Sum unknowns:')
    print(f'    unknown cardholder location: £{cardholder_unknown_complete:,}')
    print(f'    unknown merchant location: £{merchant_unknown_complete:,}')
    print(f'    unknown cardholder and merchant location: £{both_unknown_complete:,}')
    print(f'    all unknowns: £{sum_all_unknowns_complete:,}')

In [None]:
# Print the unknown-spend statistics for the original and completed data.
print_unknown_stats(cardholder_merchant, all_all, complete_dataset)