# Adjusting for cardholders

The method created so far is to have an adjustment table for each table; this is read in and joined to the table that you are working with, allowing you to create an adjusted spend/transactions value.

Further development will be required to decide whether we want to use one adjustment table or keep separate adjustment tables for each table. Moreover, how do we deal with different specifications and the drop in cardholders over COVID? These tables will have to be amended.

The adjustment tables will be stored in fin_wip_notebook and read in accordingly. 


In [None]:
project_path = "/home/jupyter"
import sys

sys.path.append(project_path)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import bigquery

from fintrans_toolbox.src import bq_utils as bq
from fintrans_toolbox.src import table_utils as t

In [None]:
# Notebook-level BigQuery client; later cells pass it to the table readers.
client = bigquery.Client()

In [None]:
# We first need to create our adjustment tables that we will bring in every time we want to create an adjusted value
# we do one for each table and each month/quarter then join on the date

# THE ADJUSTMENT TABLES will have to be amended as they don't account for ... (TODO: this note is unfinished - state what is not accounted for)

In [None]:
def adjusted_rphst(df, time_period):
    """
    Add cardholder-adjusted spend/transactions to a retail performance
    (high streets and towns) dataframe.

    The "All"/"All"/"All" series is read with
    `t.read_retail_performance_high_streets_towns`, and an index is built as
    cardholders divided by the cardholders of the first period (per
    `time_period`). The index is left-joined onto `df` and spend/transactions
    are divided by it.

    Args:
       - df: the dataframe to be adjusted. It is joined on `date_time`, or on
         `time_period_value` if the `date_time` join fails.
       - time_period: "Quarter"/"quarter"/"q" or "Month"/"month"/"m". Any other
         value is passed to the reader unchanged.
    Returns:
       - a new dataframe: `df` plus an `index` column, and `idx_spend` /
         `idx_transactions` where `spend` / `transactions` exist on `df`.
    """
    # Normalise the aliases to the spellings used elsewhere in this notebook
    # ("Month" is what the reader is called with at notebook level).
    if time_period in ["Quarter", "quarter", "q"]:
        time_period = "Quarter"
    elif time_period in ["Month", "month", "m"]:
        time_period = "Month"

    client = bigquery.Client()
    df_adj = t.read_retail_performance_high_streets_towns(
        client,
        time_period,
        cardholder_location_level="All",
        merchant_location_level="All",
        mcg="All",
        cardholder_location="",
        merchant_location="",
    )
    # TODO: read a stored adjustment table from
    # ons-fintrans-analysis-prod.fin_wip_notebook instead of rebuilding it here.

    df_adj = t.create_date_time(df_adj)
    # Sort so that .iloc[0] below is the earliest period of each group.
    df_adj = df_adj.sort_values(by=["time_period", "date_time"])
    df_adj["index"] = df_adj.groupby(["time_period"])["cardholders"].transform(
        lambda x: x / x.iloc[0]
    )

    # Link on date_time first; fall back to time_period_value when date_time
    # is missing from df (KeyError) or cannot be joined (ValueError/MergeError).
    try:
        df1 = df.merge(df_adj[["date_time", "index"]], on="date_time", how="left")
    except (KeyError, ValueError) as e:
        print(
            f"{e}: using time_period_value instead of date_time, consider converting to date_time"
        )
        df1 = df.merge(
            df_adj[["time_period_value", "index"]],
            on="time_period_value",
            how="left",
        )

    # Only adjust the value columns that are actually present.
    for col in ("spend", "transactions"):
        if col not in df1.columns:
            continue
        try:
            df1[f"idx_{col}"] = df1[col] / df1["index"]
        except TypeError as e:
            print(f"could not adjust {col}: {e}")

    # The join should be 1-to-1; extra rows mean duplicate keys in the
    # adjustment table.
    if len(df1) != len(df):
        print("output table is different length to input table, check merge")

    return df1

In [None]:
#test adjust rphst
# Call the reader with time_period="Month" and
# cardholder_location_level="POSTAL_AREA"; every other filter is left at "All"/"".
df = t.read_retail_performance_high_streets_towns(
    client,
    time_period="Month",
    cardholder_location_level="POSTAL_AREA",
    merchant_location_level="All",
    mcg="All",
    cardholder_location="",
    merchant_location="",
)

# Apply the adjustment; the bare `df2` displays the result in the notebook.
# `df2` is reused by the plotting cell below.
df2 = adjusted_rphst(df, "Month")
df2


In [None]:
#Graph actual and adjusted spend
# Restrict to the rows whose cardholder_location is "LD".
df3 = df2.loc[df2.cardholder_location == "LD"]
# Label each line so that the legend has entries to show.
plt.plot(df3["date_time"], df3["spend"], label="spend")
plt.plot(df3["date_time"], df3["idx_spend"], label="idx_spend")
plt.legend(loc='best')
plt.show()

In [None]:
def adjusted_spoc(df, 
                  time_period = "", 
                  cardholder_origin = "",
                  merchant_channel = ""):
    """
    Add cardholder-adjusted spend/transactions to a spend_origin_and_channel
    dataframe.

    The mcg = 'All' rows of `spend_origin_and_channel` are read from BigQuery,
    and an index is built as cardholders divided by the cardholders of the first
    period, per cardholder_origin / cardholder_origin_country /
    merchant_channel / time_period. The index is left-joined onto `df` and
    spend/transactions are divided by it.

    Args:
       - df: the dataframe to be adjusted. It is not modified. It is joined on
         `date_time` (or on `time_period_value` if that join fails) plus
         whichever of `time_period`, `merged_country` and `merchant_channel`
         are on the dataframe. `merged_country` is derived from
         `cardholder_origin` / `cardholder_origin_country` when both are
         present (those two columns are then dropped); otherwise it is set
         from the `cardholder_origin` argument.
       - time_period: "Quarter"/"quarter"/"q" or "Month"/"month"/"m". Defaults to any, where month and quarter will be included.
       - cardholder_origin: "All", "International Cardholder" or "United Kingdom" (case-insensitive). Defaults to any, where all of these will be included.
       - merchant_channel: "All", "Online" or "Face to Face" (case-insensitive). Defaults to any, where all of these will be included.
    Returns:
       - a new dataframe with an `index` column, plus `idx_spend` /
         `idx_transactions` where `spend` / `transactions` exist on `df`.
    Raises:
       - ValueError: if the origin columns are not on `df` and
         `cardholder_origin` is empty or is "International Cardholder"
         (international rows must be matched on country), or if
         `merchant_channel` is empty and `df` has no `merchant_channel` column.
    """
    if time_period in ["Quarter", "quarter", "q"]:
        time_period = "Quarter"
    elif time_period in ["Month", "month", "m"]:
        time_period = "Month"

    # Work on a copy so adding merged_country never mutates the caller's frame.
    df = df.copy()

    # Build the country key on df and validate the inputs BEFORE querying
    # BigQuery, so bad calls fail fast with a clear message.
    origin_cols = ["cardholder_origin_country", "cardholder_origin"]
    if set(origin_cols).issubset(df.columns):
        df["merged_country"] = df["cardholder_origin_country"]
        df.loc[df["cardholder_origin"] == "UNITED KINGDOM", "merged_country"] = "UNITED KINGDOM"
        df = df.drop(columns=origin_cols)
    elif cardholder_origin == "":
        raise ValueError(
            "cardholder origin needs to be set if cardholder_origin_country and cardholder_origin are not on the dataframe"
        )
    elif cardholder_origin.upper() == "INTERNATIONAL CARDHOLDER":
        # Without the country column there is nothing to match international rows on.
        raise ValueError(
            "cardholder_origin = 'International Cardholder' but cardholder_origin_country column does not exist. Need to match on country for international"
        )
    else:
        # The adjustment table spells the UK key in upper case.
        if cardholder_origin.upper() == "UNITED KINGDOM":
            df["merged_country"] = "UNITED KINGDOM"
        else:
            df["merged_country"] = cardholder_origin

    if merchant_channel == "" and "merchant_channel" not in df.columns:
        raise ValueError(
            "merchant channel is not specified, but there is no column for merchant channel so merge will not work"
        )

    sql = (
        "SELECT * FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` "
        "WHERE (cardholder_origin = 'UNITED KINGDOM' and cardholder_location = 'All' and mcg = 'All') "
        "or (cardholder_origin = 'International Cardholder'  and mcg = 'All') "
        "or (cardholder_origin = 'All'  and mcg = 'All')"
    )
    client = bigquery.Client()
    df_adj = bq.read_bq_table_sql(client, sql)

    # Each requested filter is applied independently (case-insensitive).
    requested = {
        "time_period": time_period,
        "cardholder_origin": cardholder_origin,
        "merchant_channel": merchant_channel,
    }
    for column, wanted in requested.items():
        if wanted != "":
            df_adj = df_adj.loc[df_adj[column].str.upper() == wanted.upper()]

    group_cols = [
        "time_period",
        "time_period_value",
        "cardholder_origin",
        "cardholder_origin_country",
        "mcg",
        "mcc",
        "merchant_channel",
    ]
    # Select the value columns explicitly: the first positional argument of
    # GroupBy.sum is numeric_only, not a column list.
    df_adj = (
        df_adj.groupby(group_cols)[["spend", "transactions", "cardholders"]]
        .sum()
        .reset_index()
    )

    df_adj = t.create_date_time(df_adj)
    index_groups = [
        "cardholder_origin",
        "cardholder_origin_country",
        "merchant_channel",
        "time_period",
    ]
    # Sort so that .iloc[0] below is the earliest period of each group.
    df_adj = df_adj.sort_values(by=index_groups + ["date_time"])
    df_adj["index"] = df_adj.groupby(index_groups)["cardholders"].transform(
        lambda x: x / x.iloc[0]
    )

    # Same country key as on df: UK rows use the origin, others the country.
    df_adj["merged_country"] = df_adj["cardholder_origin_country"]
    df_adj.loc[df_adj["cardholder_origin"] == "UNITED KINGDOM", "merged_country"] = "UNITED KINGDOM"

    # Join only on the keys that exist on df; df_adj always has all three.
    cols = ["time_period", "merged_country", "merchant_channel"]
    merge_on = [c for c in cols if c in df.columns]

    # Link on date_time first; fall back to time_period_value when date_time
    # is missing from df (KeyError) or cannot be joined (ValueError/MergeError).
    try:
        df1 = df.merge(
            df_adj[["date_time"] + merge_on + ["index"]],
            on=["date_time"] + merge_on,
            how="left",
        )
        print(f"Merging on date_time, {merge_on}")
    except (KeyError, ValueError) as e:
        print(
            f"{e}: using time_period_value instead of date_time, consider converting to date_time"
        )
        print(f"Merging on time_period_value, {merge_on}")
        df1 = df.merge(
            df_adj[["time_period_value"] + merge_on + ["index"]],
            on=["time_period_value"] + merge_on,
            how="left",
        )

    # Only adjust the value columns that are actually present.
    for col in ("spend", "transactions"):
        if col not in df1.columns:
            continue
        try:
            df1[f"idx_{col}"] = df1[col] / df1["index"]
        except TypeError as e:
            print(f"could not adjust {col}: {e}")

    # The join should be 1-to-1; extra rows mean duplicate keys in df_adj.
    if len(df1) != len(df):
        print("output table is different legth to input table, check merge")

    return df1

In [None]:
# Scratch cell for inspecting the merge keys and the merged_country assignment.
# NOTE(review): `cols`, `df_adj` and `cardholder_origin` are not assigned in this
# cell. At notebook level they are only assigned by the indented scratch cell
# further down, so that cell presumably has to be run first - confirm.
merge_on = list(set(cols) & set(df_adj.columns))
merge_on

# Overwrites the notebook-level `df` with a copy carrying a constant merged_country.
df = df.assign(merged_country = cardholder_origin)
df
#df1 = df.merge(df_adj[["time_period_value"] + merge_on + ["index"]], on = ["time_period_value"] + merge_on ,  how="left" )

#df1 = df.merge(df_adj[["date_time"] + merge_on + ["index"]], on = ["date_time"] + merge_on ,  how="left" )

In [None]:
    #merchant_channel = "Online"
    #cardholder_origin = "UNITED KINGDOM"
    # Create the client before it is used so the cell also runs in a fresh kernel.
    client = bigquery.Client()
    sql = "SELECT * FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` limit 500000 "
    df = bq.read_bq_table_sql(client, sql)
    # Keep the international-cardholder rows, then drop both origin columns so
    # that adjusted_spoc has to rely on its cardholder_origin argument.
    df = df.loc[df.cardholder_origin == "International Cardholder"]
    df = df.drop(columns =["cardholder_origin", "cardholder_origin_country"])

    df1 = adjusted_spoc(df, merchant_channel = "Online",cardholder_origin = "International Cardholder")
    df1
    
    #df1 = df1.loc[df1.merged_country == "CHILE"]
    #df1

In [None]:
    # Scratch copy of the body of adjusted_spoc, run at cell level; it leaves
    # df, df_adj, cols and merge_on behind as notebook globals (presumably so
    # they can be inspected in other cells - confirm).
    # NOTE(review): `time_period` is read below but never assigned in this cell;
    # it must already exist as a notebook global or the first `if` raises NameError.
    merchant_channel = "Online"
    cardholder_origin = "International Cardholder"
    sql = "SELECT * FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` limit 500000 "
    # NOTE(review): `client` is used here before it is re-created on the next line,
    # so this relies on a client created by an earlier cell.
    df = bq.read_bq_table_sql(client, sql)
    client = bigquery.Client()
    # NOTE(review): df keeps the UNITED KINGDOM rows while cardholder_origin above
    # is "International Cardholder" - confirm this combination is intended.
    df = df.loc[df.cardholder_origin == "UNITED KINGDOM"]
    df = df.drop(columns =["cardholder_origin", "cardholder_origin_country"]) 
    if time_period in ["Quarter", "quarter", "q"]:
        time_period = "Quarter"
    if time_period in ["Month", "month", "m"]:
        time_period = "Month"
    
    sql = "SELECT * FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` WHERE (cardholder_origin = 'UNITED KINGDOM' and cardholder_location = 'All' and mcg = 'All') or (cardholder_origin = 'International Cardholder'  and mcg = 'All') or (cardholder_origin = 'All'  and mcg = 'All')"
    client = bigquery.Client()
    df_adj = bq.read_bq_table_sql(client, sql)
    
    # Because this is an if/elif chain, only the first non-empty filter is applied.
    if time_period != "":
        df_adj = df_adj.loc[(df_adj.time_period.str.upper() == time_period.upper())]
    elif cardholder_origin != "":
        df_adj = df_adj.loc[(df_adj.cardholder_origin.str.upper() == cardholder_origin.upper())]
    elif merchant_channel != "":
        df_adj = df_adj.loc[(df_adj.merchant_channel.str.upper() == merchant_channel.upper())]
        
        
    # The list passed to .sum() lands in its first positional parameter
    # (numeric_only in pandas), not in a column selection.
    df_adj = df_adj.groupby(['time_period', 
                     'time_period_value', 
                     'cardholder_origin',    
                     'cardholder_origin_country', 
                     'mcg', 
                     'mcc', 
                     'merchant_channel']).sum(['spend', 'transactions', 'cardholders']).reset_index()

    df_adj = t.create_date_time(df_adj)
    # Sorted so that x.iloc[0] below is the first date_time of each group.
    df_adj = df_adj.sort_values(by = ["cardholder_origin","cardholder_origin_country", "merchant_channel","time_period" , "date_time"])

    # index = cardholders relative to the first period of each group.
    df_adj["index"] = df_adj.groupby(["cardholder_origin","cardholder_origin_country","merchant_channel","time_period"])[
        "cardholders"
    ].transform(lambda x: x / x.iloc[0])
    
    # merged_country: "UNITED KINGDOM" for UK rows, the origin country otherwise.
    df_adj.loc[df_adj['cardholder_origin'] == "UNITED KINGDOM", 'merged_country'] = "UNITED KINGDOM"
    df_adj.loc[df_adj['cardholder_origin'] != "UNITED KINGDOM", 'merged_country'] = df_adj.cardholder_origin_country
    
    if set([ "cardholder_origin_country","cardholder_origin"]).issubset(df.columns):
            df.loc[df['cardholder_origin'] == "UNITED KINGDOM", 'merged_country'] = "UNITED KINGDOM"
            df.loc[df['cardholder_origin'] != "UNITED KINGDOM", 'merged_country'] = df.cardholder_origin_country  
            df = df.drop(columns = [ "cardholder_origin_country","cardholder_origin"])
    elif cardholder_origin == "":
        print("cardholder origin needs to be set if cardholder_origin_country and cardholder_origin are not on the dataframe")
    else: df = df.assign(merged_country = cardholder_origin)
 
    # If cardholder origin "international" and cardholder issuing country doesn't exist, fail
    if cardholder_origin == "International Cardholder" and not any(df.columns == "cardholder_origin_country"):
        print("cardholder_origin = 'International Cardholder' but cardholder_origin_country column does not exist. Need to match on country for international")
        
    elif (merchant_channel == "" and not any(df_adj.columns == "merchant_channel")):
        print("merchant channel is not specified, but there is no column for merchant channel so merge will not work")
    
    else:
        cols = ['time_period','merged_country','merchant_channel']
        merge_on = list(set(cols) & set(df_adj.columns))
        # link on datetime, cardholder origin, issuing country and merchant channel first
    # NOTE(review): cols/merge_on are only assigned in the else branch above; if
    # either message was printed, the merge below uses whatever merge_on is left
    # over from an earlier run, or fails with NameError.
    try:
              df1 = df.merge(df_adj[["date_time"] + merge_on + ["index"]], on = ["date_time"] + merge_on ,  how="left" )
              print(f"Merging on date_time, {merge_on}")
    except Exception as e:
                    print(
                        f"{e}: using time_period_value instead of date_time, consider converting to date_time"
                    )
                    print(f"Merging on time_period_value, {merge_on}")
                    df1 = df.merge(df_adj[["time_period_value"] + merge_on + ["index"]], on = ["time_period_value"] + merge_on ,  how="left" )
df