In [None]:
# Directory appended to sys.path below; presumably where the fintrans_toolbox
# package lives on this machine -- TODO confirm.
project_path = "/home/jupyter"
import os
import sys

# Must run before the fintrans_toolbox imports further down.
sys.path.append(project_path)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from google.cloud import bigquery

from fintrans_toolbox.src import bq_utils as bq
from fintrans_toolbox.src import table_utils as t


# BigQuery client used by the query cells below.
client = bigquery.Client()



In [None]:
# Quarterly Face to Face spend per MCG (cardholder_origin = 'UNITED KINGDOM')

# One row per (quarter, MCG): spend summed over the rows that match the filters.
Total_MCGs_F2F = """
WITH mcg_quarterly_spend AS (
  SELECT 
   time_period_value,
    SUM(spend) AS total_spend,
    mcg
  FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` 
  WHERE time_period = 'Quarter'
  AND mcg != 'All'
  AND mcc = 'All'
  AND cardholder_origin_country = 'All' 
  AND cardholder_origin = 'UNITED KINGDOM' 
  AND merchant_channel = 'Face to Face'
  GROUP BY time_period_value, mcg
)

SELECT *
FROM mcg_quarterly_spend
ORDER BY time_period_value, mcg;
"""

# Submit the query, then pull the result set into a pandas DataFrame.
f2f_query_job = client.query(Total_MCGs_F2F)
df_Total_MCGs_F2F = f2f_query_job.to_dataframe()

# Persist the result (read back by the Face to Face adjustment cell) and echo it.
df_Total_MCGs_F2F.to_csv("Total_MCGs_F2F.csv", index=False)
print(df_Total_MCGs_F2F)

In [None]:
# Quarterly Online spend per MCG (cardholder_origin = 'UNITED KINGDOM')

# One row per (quarter, MCG): spend summed over the rows that match the filters.
Total_MCGs_Online = """
WITH mcg_quarterly_spend AS (
  SELECT 
    time_period_value,
    SUM(spend) AS total_spend,
    mcg
  FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` 
  WHERE time_period = 'Quarter'
  AND mcg != 'All'
  AND mcc = 'All'
  AND cardholder_origin_country = 'All' 
  AND cardholder_origin = 'UNITED KINGDOM' 
  AND merchant_channel = 'Online'
  GROUP BY time_period_value, mcg
)

SELECT *
FROM mcg_quarterly_spend
ORDER BY time_period_value, mcg;
"""

# Submit the query, then pull the result set into a pandas DataFrame.
online_query_job = client.query(Total_MCGs_Online)
df_Total_MCGs_Online = online_query_job.to_dataframe()

# Persist the result (read back by the Online adjustment cell) and echo it.
df_Total_MCGs_Online.to_csv("Total_MCGs_Online.csv", index=False)
print(df_Total_MCGs_Online)

In [None]:
# UK cardholder totals per quarter (mcg = 'All', merchant_channel = 'All').
# Output: Total_MCGs_Cardholder0.csv with columns time_period_value, total_cardholders.

# The SQL text gets its own name so that UK_spending_by_mcg_All only ever
# refers to the aggregated DataFrame built further down.
# NOTE(review): unlike the per-MCG spend queries, this query has no
# `mcc = 'All'` filter -- confirm that summing cardholders per quarter does not
# double count rows.
UK_spending_by_mcg_All_sql = '''SELECT time_period_value, cardholders, mcg, spend 
FROM `ons-fintrans-data-prod.fintrans_visa.spend_origin_and_channel` 
where time_period = 'Quarter' 
and mcg = 'All' 
and merchant_channel = 'All' 
and cardholder_origin_country = 'All' 
and cardholder_origin = 'UNITED KINGDOM' 
 
GROUP BY cardholders, mcg, 
time_period_value, spend 
ORDER BY time_period_value, mcg'''
df_by_mcg_All = bq.read_bq_table_sql(client, UK_spending_by_mcg_All_sql)

# Validate the columns that are actually aggregated below. Fail loudly here
# rather than later with an unrelated error when the CSV is written.
required_columns = {'time_period_value', 'cardholders'}
if (
    df_by_mcg_All is None
    or df_by_mcg_All.empty
    or not required_columns.issubset(df_by_mcg_All.columns)
):
    raise ValueError("DataFrame is empty or missing required columns.")

# Sum cardholders per quarter and give the column its downstream name.
UK_spending_by_mcg_All = (
    df_by_mcg_All.groupby('time_period_value')['cardholders']
    .sum()
    .reset_index()
    .rename(columns={'cardholders': 'total_cardholders'})
)
print(UK_spending_by_mcg_All)

# Save the result; the adjustment cells read this file back.
csv_filename = "Total_MCGs_Cardholder0.csv"
UK_spending_by_mcg_All.to_csv(csv_filename, index=False)

print(f"CSV file '{csv_filename}' has been created successfully.")

In [None]:
# Online MCG spend rescaled to the 2019Q1 cardholder base:
#   adjusted = (cardholders in 2019Q1 / cardholders in the quarter) * total_spend

import pandas as pd

# Inputs written by the earlier query cells.
cardholders_df = pd.read_csv("Total_MCGs_Cardholder0.csv")
online_df = pd.read_csv("Total_MCGs_Online.csv")

# Attach each quarter's cardholder total to every MCG row of that quarter.
merged_df = online_df.merge(cardholders_df, on="time_period_value", how="inner")

# Cardholder total of the base quarter (2019Q1).
is_base_quarter = cardholders_df["time_period_value"] == "2019Q1"
base_cardholders = cardholders_df.loc[is_base_quarter, "total_cardholders"].values[0]

# Scale spend by the ratio of base-quarter cardholders to that quarter's cardholders.
cardholder_ratio = base_cardholders / merged_df["total_cardholders"]
merged_df["adjusted_Online_spend"] = cardholder_ratio * merged_df["total_spend"]

# Only the key columns and the adjusted value are written out.
output_columns = ["time_period_value", "mcg", "adjusted_Online_spend"]
merged_df[output_columns].to_csv("Adjusted_Online_MCGs.csv", index=False)

print("The adjusted online spend data has been saved to 'Adjusted_Online_MCGs.csv'.")

# Preview: first 40 rows of the merged frame.
merged_df.head(40)



In [None]:
# Face to Face MCG spend rescaled to the 2019Q1 cardholder base:
#   adjusted = (cardholders in 2019Q1 / cardholders in the quarter) * total_spend

import pandas as pd

# Inputs written by the earlier query cells.
# NOTE: despite its name, online_df holds the Face to Face spend in this cell.
cardholders_df = pd.read_csv("Total_MCGs_Cardholder0.csv")
online_df = pd.read_csv("Total_MCGs_F2F.csv")

# Attach each quarter's cardholder total to every MCG row of that quarter.
merged_df = online_df.merge(cardholders_df, on="time_period_value", how="inner")

# Cardholder total of the base quarter (2019Q1).
is_base_quarter = cardholders_df["time_period_value"] == "2019Q1"
base_cardholders = cardholders_df.loc[is_base_quarter, "total_cardholders"].values[0]

# Scale spend by the ratio of base-quarter cardholders to that quarter's cardholders.
cardholder_ratio = base_cardholders / merged_df["total_cardholders"]
merged_df["adjusted_F2F_spend"] = cardholder_ratio * merged_df["total_spend"]

# Only the key columns and the adjusted value are written out.
output_columns = ["time_period_value", "mcg", "adjusted_F2F_spend"]
merged_df[output_columns].to_csv("Adjusted_F2F_MCGs.csv", index=False)

print("The adjusted F2F spend data has been saved to 'Adjusted_F2F_MCGs.csv'.")

# Preview: first 40 rows of the merged frame.
merged_df.head(40)



In [None]:
# Indexed Face to Face spend per MCG, with the 2019 average set to 100:
#   Indexed Spend = (quarter's adjusted spend / mean adjusted spend over 2019Q1-2019Q4) * 100

import pandas as pd

# Adjusted Face to Face spend produced by the adjustment cell.
df = pd.read_csv("Adjusted_F2F_MCGs.csv")

# Rows belonging to the base year.
base_quarters = ["2019Q1", "2019Q2", "2019Q3", "2019Q4"]
in_base_year = df['time_period_value'].isin(base_quarters)
df_2019 = df[in_base_year]

# Mean adjusted spend over the base-year rows, one value per MCG.
base_avg_per_mcg = df_2019.groupby('mcg')['adjusted_F2F_spend'].mean()

# Function to compute indexed spend with base average = 100 for each MCG
def compute_indexed(row, base_avgs=None, value_col='adjusted_F2F_spend'):
    """Return the row's spend as an index where the MCG's base average equals 100.

    Parameters
    ----------
    row : mapping
        Must provide ``row['mcg']`` and ``row[value_col]`` (for example a
        DataFrame row passed by ``df.apply(..., axis=1)``).
    base_avgs : mapping or pandas.Series, optional
        Base average per MCG; anything with a ``.get(key, default)`` method.
        Defaults to the notebook-level ``base_avg_per_mcg``.
    value_col : str, optional
        Name of the spend column to index. Defaults to ``'adjusted_F2F_spend'``.

    Returns
    -------
    float or None
        ``(row[value_col] / base average) * 100``, or ``None`` when the MCG has
        no base average or the base average is zero.
    """
    if base_avgs is None:
        # Fall back to the notebook global so existing calls keep working.
        base_avgs = base_avg_per_mcg
    base_avg = base_avgs.get(row['mcg'], None)
    # A missing or zero base would make the index undefined.
    if base_avg is None or base_avg == 0:
        return None
    return (row[value_col] / base_avg) * 100

# Index every row against its MCG's base average.
indexed_spend_f2f = df.apply(compute_indexed, axis=1)
df['Indexed Spend F2F'] = indexed_spend_f2f

# Write the frame, including the new index column, for the chart cells.
df.to_csv("Indexed_F2F_MCGs.csv", index=False)

print("The file has been created with indexed spend starting from 100 for each MCG.")



In [None]:
# Diamond Periods 2021Q4 to 2025Q2 Bar Charts
# For each (Q4, Q1, Q2) period, chart the five MCGs whose indexed Face to Face
# spend moved the most across the Q4 -> Q1 -> Q2 sequence.

import pandas as pd
import matplotlib.pyplot as plt

# Load the indexed spend data
df = pd.read_csv("Indexed_F2F_MCGs.csv")

# Define the four periods of interest as (Q4, following Q1, following Q2)
periods = [
    ("2021Q4", "2022Q1", "2022Q2"),
    ("2022Q4", "2023Q1", "2023Q2"),
    ("2023Q4", "2024Q1", "2024Q2"),
    ("2024Q4", "2025Q1", "2025Q2")
]

# Top-5 table per period, keyed by "<Q4> to <Q2>"
results = {}

# Analyze each period
for q4, q1, q2 in periods:
    quarters = [q4, q1, q2]

    # One row per MCG, one column per quarter present in the data
    pivot = df[df['time_period_value'].isin(quarters)].pivot(index='mcg', columns='time_period_value', values='Indexed Spend F2F')

    # A quarter with no rows yields no column; skip the period instead of
    # failing with a KeyError on the column lookups below.
    missing_quarters = [q for q in quarters if q not in pivot.columns]
    if missing_quarters:
        print(f"Skipping {q4} to {q2}: no data for {', '.join(missing_quarters)}")
        continue

    # Keep only MCGs that have a value in all three quarters
    pivot = pivot.dropna()

    # Change in index points: Q4 -> Q1 ("drop") and Q1 -> Q2 ("rebound")
    pivot['drop'] = pivot[q1] - pivot[q4]
    pivot['rebound'] = pivot[q2] - pivot[q1]
    pivot['total_change'] = abs(pivot['drop']) + abs(pivot['rebound'])

    # Sort by total change and select top 5
    top_mcg = pivot.sort_values(by='total_change', ascending=False).head(5)

    # Store for later inspection
    results[f"{q4} to {q2}"] = top_mcg

    # Grouped bars: drop and rebound side by side for each MCG
    plt.figure(figsize=(10, 6))
    bar_width = 0.35
    index = range(len(top_mcg))

    plt.bar(index, top_mcg['drop'], bar_width, label='Drop (Q4 to Q1)', color='skyblue')
    plt.bar([i + bar_width for i in index], top_mcg['rebound'], bar_width, label='Rebound (Q1 to Q2)', color='plum')

    plt.xlabel('MCG')
    plt.ylabel('Indexed Spend Change')
    plt.title(f'Top MCGs with Sharp Drop and Rebound: {q4} to {q2}')
    plt.xticks([i + bar_width / 2 for i in index], top_mcg.index, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"Top_MCGs_{q4}_to_{q2}.png")
    # Show the figure explicitly rather than relying on the implicit
    # end-of-cell flush of open figures.
    plt.show()

list(results.keys())



In [None]:
# Face to Face drop/rebound bar charts with horizontal grid lines.
# Each chart is saved as Indexed_Spend_F2F_Change_<Q4>_<Q1>_<Q2>.png and then
# closed; nothing is displayed by this cell.

import pandas as pd
import matplotlib.pyplot as plt

# Load the indexed spend data
df = pd.read_csv("Indexed_F2F_MCGs.csv")

# Define the four periods of interest as (Q4, following Q1, following Q2)
periods = [
    ("2021Q4", "2022Q1", "2022Q2"),
    ("2022Q4", "2023Q1", "2023Q2"),
    ("2023Q4", "2024Q1", "2024Q2"),
    ("2024Q4", "2025Q1", "2025Q2")
]

# Analyze each period and write one chart per period
for q4, q1, q2 in periods:
    quarters = [q4, q1, q2]

    # One row per MCG, one column per quarter present in the data
    pivot = df[df['time_period_value'].isin(quarters)].pivot(index='mcg', columns='time_period_value', values='Indexed Spend F2F')

    # A quarter with no rows yields no column; skip the period instead of
    # failing with a KeyError on the column lookups below.
    missing_quarters = [q for q in quarters if q not in pivot.columns]
    if missing_quarters:
        print(f"Skipping {q4} to {q2}: no data for {', '.join(missing_quarters)}")
        continue

    # Keep only MCGs that have a value in all three quarters
    pivot = pivot.dropna()

    # Change in index points: Q4 -> Q1 ("drop") and Q1 -> Q2 ("rebound")
    pivot['drop'] = pivot[q1] - pivot[q4]
    pivot['rebound'] = pivot[q2] - pivot[q1]
    pivot['total_change'] = abs(pivot['drop']) + abs(pivot['rebound'])

    # Sort by total change and select top 5
    top_mcg = pivot.sort_values(by='total_change', ascending=False).head(5)

    # Grouped bars with dashed horizontal grid lines
    plt.figure(figsize=(10, 6))
    bar_width = 0.35
    index = range(len(top_mcg))

    plt.bar(index, top_mcg['drop'], bar_width, label='Drop (Q4 to Q1)', color='skyblue')
    plt.bar([i + bar_width for i in index], top_mcg['rebound'], bar_width, label='Rebound (Q1 to Q2)', color='plum')

    plt.xlabel('MCG')
    plt.ylabel('Indexed Spend F2F Change')
    plt.title(f'Top MCGs with Sharp Drop and Rebound: {q4} to {q2}')
    plt.xticks([i + bar_width / 2 for i in index], top_mcg.index, rotation=45)
    plt.legend()
    plt.grid(axis='y', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig(f"Indexed_Spend_F2F_Change_{q4}_{q1}_{q2}.png")
    plt.close()

print("Updated charts with grid lines have been saved.")



In [None]:
# Display the saved Face to Face grid-line bar charts

from PIL import Image
import matplotlib.pyplot as plt

# PNG files written by the Face to Face grid-line chart cell
chart_files = [
    "Indexed_Spend_F2F_Change_2021Q4_2022Q1_2022Q2.png",
    "Indexed_Spend_F2F_Change_2022Q4_2023Q1_2023Q2.png",
    "Indexed_Spend_F2F_Change_2023Q4_2024Q1_2024Q2.png",
    "Indexed_Spend_F2F_Change_2024Q4_2025Q1_2025Q2.png"
]

# Render each image on its own axis-free figure, titled after the file name
for file in chart_files:
    chart_title = file.replace("_", " ").replace(".png", "")
    img = Image.open(file)
    plt.figure(figsize=(10, 6))
    plt.imshow(img)
    plt.axis('off')
    plt.title(chart_title)
    plt.show()



In [None]:
# Diamond pattern: percentage change of indexed Face to Face spend per MCG
# across each Q4 -> Q1 -> Q2 sequence.

import pandas as pd

df = pd.read_csv("Indexed_F2F_MCGs.csv")

# Quarter labels are compared as strings below
df['time_period_value'] = df['time_period_value'].astype(str)

# Each tuple is (Q4, following Q1, following Q2)
periods = [
    ("2021Q4", "2022Q1", "2022Q2"),
    ("2022Q4", "2023Q1", "2023Q2"),
    ("2023Q4", "2024Q1", "2024Q2"),
    ("2024Q4", "2025Q1", "2025Q2")
]

# One record per (MCG, period) that has all three quarters available
results = []

for mcg in df['mcg'].unique():
    df_mcg = df[df['mcg'] == mcg].set_index('time_period_value')
    for p1, p2, p3 in periods:
        # Skip periods for which this MCG lacks one of the quarters
        if not all(quarter in df_mcg.index for quarter in (p1, p2, p3)):
            continue
        val1, val2, val3 = (df_mcg.loc[quarter, 'Indexed Spend F2F'] for quarter in (p1, p2, p3))
        # Change relative to the preceding quarter, in percent
        drop_pct = ((val2 - val1) / val1) * 100
        rebound_pct = ((val3 - val2) / val2) * 100
        record = {
            'mcg': mcg,
            'period': f"{p1}→{p2}→{p3}",
            'drop_%': round(drop_pct, 2),
            'rebound_%': round(rebound_pct, 2)
        }
        results.append(record)

# Tabulate and persist
change_df = pd.DataFrame(results)
change_df.to_csv("MCG_Percentage_Changes_Diamond_F2F.csv", index=False)

# Echo the name of the file that was written
print("MCG_Percentage_Changes_Diamond_F2F.csv")



In [None]:
# Diamond pattern: mean drop / rebound percentage per MCG over the periods
# recorded in MCG_Percentage_Changes_Diamond_F2F.csv.

import pandas as pd

df = pd.read_csv("MCG_Percentage_Changes_Diamond_F2F.csv")

# Average both percentage columns per MCG and prefix the column names with avg_
avg_changes = (
    df.groupby('mcg')[['drop_%', 'rebound_%']]
    .mean()
    .reset_index()
    .rename(columns={'drop_%': 'avg_drop_%', 'rebound_%': 'avg_rebound_%'})
)

avg_changes.to_csv("MCG_Average_Percentage_Changes_F2F.csv", index=False)

# Echo the name of the file that was written
print("MCG_Average_Percentage_Changes_F2F.csv")



In [None]:
# Interactive line chart of indexed Face to Face spend per MCG.
# Traces are added in descending order of the latest period's value, and each
# legend entry carries that value.

import pandas as pd
import plotly.graph_objects as go

file_path = 'Indexed_F2F_MCGs.csv'
df = pd.read_csv(file_path)

# Parse the period labels as datetimes; labels that fail to parse become NaT
# and their rows are dropped.
df['time_period_value'] = pd.to_datetime(df['time_period_value'], errors='coerce')
df = df.dropna(subset=['time_period_value'])

# Most recent period in the data
latest_period = df['time_period_value'].max()

# Latest value per MCG, highest first
is_latest = df['time_period_value'] == latest_period
latest_values = df[is_latest][['mcg', 'Indexed Spend F2F']]
latest_values_sorted = latest_values.sort_values(by='Indexed Spend F2F', ascending=False)

fig = go.Figure()

for mcg in latest_values_sorted['mcg']:
    subset = df[df['mcg'] == mcg]
    # Value shown in the legend next to the MCG name
    latest_value = subset[subset['time_period_value'] == latest_period]['Indexed Spend F2F'].values[0]
    trace = go.Scatter(
        x=subset['time_period_value'],
        y=subset['Indexed Spend F2F'],
        mode='lines',
        name=f"{mcg} ({latest_value:.2f})"
    )
    fig.add_trace(trace)

# Titles, unified hover and a taller canvas so the traces are easier to follow
fig.update_layout(
    title='Indexed Spend F2F Over Time by MCG Category',
    xaxis_title='Time Period',
    yaxis_title='Indexed Spend F2F',
    legend_title='MCG Category (Latest Value)',
    hovermode='x unified',
    height=800
)

# Keep a standalone HTML copy, then render inline
fig.write_html('interactive_indexed_spend_f2f_ranked.html')
fig.show()



In [None]:
# Selectable interactive line chart for Indexed_F2F_MCGs.csv.
# Only the first MCG is drawn initially; the others can be switched on from the legend.

import pandas as pd
import plotly.express as px

# Load the CSV file
file_path = 'Indexed_F2F_MCGs.csv'
df = pd.read_csv(file_path)

# Ensure time_period_value is treated as string
df['time_period_value'] = df['time_period_value'].astype(str)

# Calculate 2019 average for each MCG
df_2019 = df[df['time_period_value'].isin(["2019Q1", "2019Q2", "2019Q3", "2019Q4"])]
mcg_2019_avg = df_2019.groupby('mcg')['Indexed Spend F2F'].mean()

# Normalize so that the 2019 average = 100 for each MCG.
# Vectorised: map every row's MCG to its 2019 average (NaN when the MCG has no
# 2019 rows) and divide column-wise instead of applying a Python lambda per row.
base_for_row = df['mcg'].map(mcg_2019_avg)
df['Normalized Indexed Spend F2F'] = (df['Indexed Spend F2F'] / base_for_row) * 100

# Drop rows with missing normalized values
df = df.dropna(subset=['Normalized Indexed Spend F2F'])

# Create the interactive line chart, one trace per MCG
fig = px.line(
    df,
    x='time_period_value',
    y='Normalized Indexed Spend F2F',
    color='mcg',
    title='Normalized Indexed Spend F2F Over Time by MCG Category (2019 Avg = 100)'
)

# Hide all traces initially (they stay available in the legend)
fig.update_traces(visible='legendonly')

# Show only the first MCG trace by default
first_mcg = df['mcg'].unique()[0]
fig.for_each_trace(lambda trace: trace.update(visible=True) if trace.name == first_mcg else None)

# Update layout for better visibility
fig.update_layout(
    height=800,
    xaxis_title='Time Period',
    yaxis_title='Normalized Indexed Spend F2F (2019 Avg = 100)',
    legend_title='MCG Category',
    hovermode='x unified'
)

# Show the interactive plot
fig.show()



In [None]:
# Selectable line chart plus a latest-quarter ranking bar chart for Indexed_F2F_MCGs.csv

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the CSV file
file_path = 'Indexed_F2F_MCGs.csv'
df = pd.read_csv(file_path)

# Ensure time_period_value is treated as string
df['time_period_value'] = df['time_period_value'].astype(str)

# Calculate 2019 average for each MCG
df_2019 = df[df['time_period_value'].isin(["2019Q1", "2019Q2", "2019Q3", "2019Q4"])]
mcg_2019_avg = df_2019.groupby('mcg')['Indexed Spend F2F'].mean()

# Normalize so that the 2019 average = 100 for each MCG.
# Vectorised: map every row's MCG to its 2019 average (NaN when the MCG has no
# 2019 rows) and divide column-wise instead of applying a Python lambda per row.
base_for_row = df['mcg'].map(mcg_2019_avg)
df['Normalized Indexed Spend F2F'] = (df['Indexed Spend F2F'] / base_for_row) * 100

# Drop rows with missing normalized values
df = df.dropna(subset=['Normalized Indexed Spend F2F'])

# Identify the latest quarter. This is a string maximum, which matches
# chronological order as long as every label has the 'YYYYQn' form.
latest_quarter = df['time_period_value'].max()

# Latest value per MCG, ascending so the largest bar ends up at the top
latest_values = df[df['time_period_value'] == latest_quarter][['mcg', 'Normalized Indexed Spend F2F']]
latest_values_sorted = latest_values.sort_values(by='Normalized Indexed Spend F2F', ascending=True)

# Create the interactive line chart, one trace per MCG
fig_line = px.line(
    df,
    x='time_period_value',
    y='Normalized Indexed Spend F2F',
    color='mcg',
    title='Normalized Indexed Spend F2F Over Time by MCG Category (2019 Avg = 100)'
)

# Hide all traces initially (they stay available in the legend)
fig_line.update_traces(visible='legendonly')

# Show only the first MCG trace by default
first_mcg = df['mcg'].unique()[0]
fig_line.for_each_trace(lambda trace: trace.update(visible=True) if trace.name == first_mcg else None)

# Update layout for better visibility
fig_line.update_layout(
    height=700,
    xaxis_title='Time Period',
    yaxis_title='Normalized Indexed Spend F2F (2019 Avg = 100)',
    legend_title='MCG Category',
    hovermode='x unified'
)

# Create the ranking bar chart (horizontal, one bar per MCG)
fig_bar = go.Figure(go.Bar(
    x=latest_values_sorted['Normalized Indexed Spend F2F'],
    y=latest_values_sorted['mcg'],
    orientation='h',
    marker=dict(color='skyblue')
))

fig_bar.update_layout(
    title=f'MCG Category Ranking by Normalized Indexed Spend F2F in {latest_quarter}',
    xaxis_title='Normalized Indexed Spend F2F (2019 Avg = 100)',
    yaxis_title='MCG Category',
    height=800
)

# Show both figures
fig_line.show()
fig_bar.show()





In [None]:
# Share of each MCG's 'Indexed Spend F2F' within its time period, in percent

import pandas as pd

file_path = 'Indexed_F2F_MCGs.csv'
df = pd.read_csv(file_path)


def _share_of_period(values):
    """Express each value as a percentage of the sum of its group."""
    return 100 * values / values.sum()


# Every row is divided by the total of its own time period
df['Indexed Spend F2F %'] = df.groupby('time_period_value')['Indexed Spend F2F'].transform(_share_of_period)

# Preview the key columns together with the new percentage column
preview_columns = ['time_period_value', 'mcg', 'Indexed Spend F2F', 'Indexed Spend F2F %']
df[preview_columns].head()



In [None]:

# Write the current `df` (expected to be the frame from the percentage-share
# cell above) to its own CSV file
output_file = 'Indexed_F2F_MCGs_with_percent.csv'
df.to_csv(path_or_buf=output_file, index=False)

print("Updated CSV with percentage share saved as '{}'.".format(output_file))

In [None]:
# Indexed Online spend per MCG, with the 2019 average set to 100:
#   Indexed Spend = (quarter's adjusted spend / mean adjusted spend over 2019Q1-2019Q4) * 100

import pandas as pd

# Adjusted Online spend produced by the adjustment cell.
df = pd.read_csv("Adjusted_Online_MCGs.csv")

# Rows belonging to the base year.
base_quarters = ["2019Q1", "2019Q2", "2019Q3", "2019Q4"]
in_base_year = df['time_period_value'].isin(base_quarters)
df_2019 = df[in_base_year]

# Mean adjusted spend over the base-year rows, one value per MCG.
base_avg_per_mcg = df_2019.groupby('mcg')['adjusted_Online_spend'].mean()

# Function to compute indexed spend with base average = 100 for each MCG
def compute_indexed(row, base_avgs=None, value_col='adjusted_Online_spend'):
    """Return the row's spend as an index where the MCG's base average equals 100.

    Parameters
    ----------
    row : mapping
        Must provide ``row['mcg']`` and ``row[value_col]`` (for example a
        DataFrame row passed by ``df.apply(..., axis=1)``).
    base_avgs : mapping or pandas.Series, optional
        Base average per MCG; anything with a ``.get(key, default)`` method.
        Defaults to the notebook-level ``base_avg_per_mcg``.
    value_col : str, optional
        Name of the spend column to index. Defaults to ``'adjusted_Online_spend'``.

    Returns
    -------
    float or None
        ``(row[value_col] / base average) * 100``, or ``None`` when the MCG has
        no base average or the base average is zero.
    """
    if base_avgs is None:
        # Fall back to the notebook global so existing calls keep working.
        base_avgs = base_avg_per_mcg
    base_avg = base_avgs.get(row['mcg'], None)
    # A missing or zero base would make the index undefined.
    if base_avg is None or base_avg == 0:
        return None
    return (row[value_col] / base_avg) * 100

# Index every row against its MCG's base average.
indexed_spend_online = df.apply(compute_indexed, axis=1)
df['Indexed Spend Online'] = indexed_spend_online

# Write the frame, including the new index column, for the chart cells.
df.to_csv("Indexed_Online_MCGs.csv", index=False)

print("The file has been created with indexed spend starting from 100 for each MCG.")





In [None]:
# Interactive line chart of indexed Online spend per MCG.
# Traces are added in descending order of the latest period's value, and each
# legend entry carries that value.

import pandas as pd
import plotly.graph_objects as go

file_path = 'Indexed_Online_MCGs.csv'
df = pd.read_csv(file_path)

# Parse the period labels as datetimes; labels that fail to parse become NaT
# and their rows are dropped.
df['time_period_value'] = pd.to_datetime(df['time_period_value'], errors='coerce')
df = df.dropna(subset=['time_period_value'])

# Most recent period in the data
latest_period = df['time_period_value'].max()

# Latest value per MCG, highest first
is_latest = df['time_period_value'] == latest_period
latest_values = df[is_latest][['mcg', 'Indexed Spend Online']]
latest_values_sorted = latest_values.sort_values(by='Indexed Spend Online', ascending=False)

fig = go.Figure()

for mcg in latest_values_sorted['mcg']:
    subset = df[df['mcg'] == mcg]
    # Value shown in the legend next to the MCG name
    latest_value = subset[subset['time_period_value'] == latest_period]['Indexed Spend Online'].values[0]
    trace = go.Scatter(
        x=subset['time_period_value'],
        y=subset['Indexed Spend Online'],
        mode='lines',
        name=f"{mcg} ({latest_value:.2f})"
    )
    fig.add_trace(trace)

# Titles, unified hover and a taller canvas so the traces are easier to follow
fig.update_layout(
    title='Indexed Spend Online Over Time by MCG Category',
    xaxis_title='Time Period',
    yaxis_title='Indexed Spend Online',
    legend_title='MCG Category (Latest Value)',
    hovermode='x unified',
    height=800
)

# Keep a standalone HTML copy, then render inline
fig.write_html('interactive_indexed_spend_online_ranked.html')
fig.show()

In [None]:
# Selectable line chart plus a latest-quarter ranking bar chart for Indexed_Online_MCGs.csv

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the CSV file
file_path = 'Indexed_Online_MCGs.csv'
df = pd.read_csv(file_path)

# Ensure time_period_value is treated as string
df['time_period_value'] = df['time_period_value'].astype(str)

# Calculate 2019 average for each MCG
df_2019 = df[df['time_period_value'].isin(["2019Q1", "2019Q2", "2019Q3", "2019Q4"])]
mcg_2019_avg = df_2019.groupby('mcg')['Indexed Spend Online'].mean()

# Normalize so that the 2019 average = 100 for each MCG.
# Vectorised: map every row's MCG to its 2019 average (NaN when the MCG has no
# 2019 rows) and divide column-wise instead of applying a Python lambda per row.
base_for_row = df['mcg'].map(mcg_2019_avg)
df['Normalized Indexed Spend Online'] = (df['Indexed Spend Online'] / base_for_row) * 100

# Drop rows with missing normalized values
df = df.dropna(subset=['Normalized Indexed Spend Online'])

# Identify the latest quarter. This is a string maximum, which matches
# chronological order as long as every label has the 'YYYYQn' form.
latest_quarter = df['time_period_value'].max()

# Latest value per MCG, ascending so the largest bar ends up at the top
latest_values = df[df['time_period_value'] == latest_quarter][['mcg', 'Normalized Indexed Spend Online']]
latest_values_sorted = latest_values.sort_values(by='Normalized Indexed Spend Online', ascending=True)

# Create the interactive line chart, one trace per MCG
fig_line = px.line(
    df,
    x='time_period_value',
    y='Normalized Indexed Spend Online',
    color='mcg',
    title='Normalized Indexed Spend Online Over Time by MCG Category (2019 Avg = 100)'
)

# Hide all traces initially (they stay available in the legend)
fig_line.update_traces(visible='legendonly')

# Show only the first MCG trace by default
first_mcg = df['mcg'].unique()[0]
fig_line.for_each_trace(lambda trace: trace.update(visible=True) if trace.name == first_mcg else None)

# Update layout for better visibility
fig_line.update_layout(
    height=700,
    xaxis_title='Time Period',
    yaxis_title='Normalized Indexed Spend Online (2019 Avg = 100)',
    legend_title='MCG Category',
    hovermode='x unified'
)

# Create the ranking bar chart (horizontal, one bar per MCG)
fig_bar = go.Figure(go.Bar(
    x=latest_values_sorted['Normalized Indexed Spend Online'],
    y=latest_values_sorted['mcg'],
    orientation='h',
    marker=dict(color='skyblue')
))

fig_bar.update_layout(
    title=f'MCG Category Ranking by Normalized Indexed Spend Online in {latest_quarter}',
    xaxis_title='Normalized Indexed Spend Online (2019 Avg = 100)',
    yaxis_title='MCG Category',
    height=800
)

# Show both figures
fig_line.show()
fig_bar.show()





In [None]:
# Online drop/rebound bar charts with horizontal grid lines.
# Each chart is saved as Indexed_Spend_Online_Change_<Q4>_<Q1>_<Q2>.png and
# then closed; nothing is displayed by this cell.

import pandas as pd
import matplotlib.pyplot as plt

# Load the indexed spend data
df = pd.read_csv("Indexed_Online_MCGs.csv")

# Define the four periods of interest as (Q4, following Q1, following Q2)
periods = [
    ("2021Q4", "2022Q1", "2022Q2"),
    ("2022Q4", "2023Q1", "2023Q2"),
    ("2023Q4", "2024Q1", "2024Q2"),
    ("2024Q4", "2025Q1", "2025Q2")
]

# Analyze each period and write one chart per period
for q4, q1, q2 in periods:
    quarters = [q4, q1, q2]

    # One row per MCG, one column per quarter present in the data
    pivot = df[df['time_period_value'].isin(quarters)].pivot(index='mcg', columns='time_period_value', values='Indexed Spend Online')

    # A quarter with no rows yields no column; skip the period instead of
    # failing with a KeyError on the column lookups below.
    missing_quarters = [q for q in quarters if q not in pivot.columns]
    if missing_quarters:
        print(f"Skipping {q4} to {q2}: no data for {', '.join(missing_quarters)}")
        continue

    # Keep only MCGs that have a value in all three quarters
    pivot = pivot.dropna()

    # Change in index points: Q4 -> Q1 ("drop") and Q1 -> Q2 ("rebound")
    pivot['drop'] = pivot[q1] - pivot[q4]
    pivot['rebound'] = pivot[q2] - pivot[q1]
    pivot['total_change'] = abs(pivot['drop']) + abs(pivot['rebound'])

    # Sort by total change and select top 5
    top_mcg = pivot.sort_values(by='total_change', ascending=False).head(5)

    # Grouped bars with dashed horizontal grid lines
    plt.figure(figsize=(10, 6))
    bar_width = 0.35
    index = range(len(top_mcg))

    plt.bar(index, top_mcg['drop'], bar_width, label='Drop (Q4 to Q1)', color='skyblue')
    plt.bar([i + bar_width for i in index], top_mcg['rebound'], bar_width, label='Rebound (Q1 to Q2)', color='plum')

    plt.xlabel('MCG')
    plt.ylabel('Indexed Spend Online Change')
    plt.title(f'Top MCGs with Sharp Drop and Rebound: {q4} to {q2}')
    plt.xticks([i + bar_width / 2 for i in index], top_mcg.index, rotation=45)
    plt.legend()
    plt.grid(axis='y', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig(f"Indexed_Spend_Online_Change_{q4}_{q1}_{q2}.png")
    plt.close()

print("Updated charts with grid lines have been saved.")


In [None]:
# Display the saved Online grid-line bar charts

from PIL import Image
import matplotlib.pyplot as plt

# PNG files written by the Online grid-line chart cell
chart_files = [
    "Indexed_Spend_Online_Change_2021Q4_2022Q1_2022Q2.png",
    "Indexed_Spend_Online_Change_2022Q4_2023Q1_2023Q2.png",
    "Indexed_Spend_Online_Change_2023Q4_2024Q1_2024Q2.png",
    "Indexed_Spend_Online_Change_2024Q4_2025Q1_2025Q2.png"
]

# Render each image on its own axis-free figure, titled after the file name
for file in chart_files:
    chart_title = file.replace("_", " ").replace(".png", "")
    img = Image.open(file)
    plt.figure(figsize=(10, 6))
    plt.imshow(img)
    plt.axis('off')
    plt.title(chart_title)
    plt.show()


In [None]:
# Diamond pattern: percentage change of adjusted Online spend per MCG
# across each Q4 -> Q1 -> Q2 sequence.
# NOTE(review): here 'rebound_%' is the Q4 -> Q1 change and 'drop_%' the
# Q1 -> Q2 change, the reverse of the labelling in the Face to Face
# percentage-change cell -- confirm this is intended.

import pandas as pd

df = pd.read_csv("Adjusted_Online_MCGs.csv")

# Quarter labels are compared as strings below
df['time_period_value'] = df['time_period_value'].astype(str)

# Each tuple is (Q4, following Q1, following Q2)
periods = [
    ("2021Q4", "2022Q1", "2022Q2"),
    ("2022Q4", "2023Q1", "2023Q2"),
    ("2023Q4", "2024Q1", "2024Q2"),
    ("2024Q4", "2025Q1", "2025Q2")
]

# One record per (MCG, period) that has all three quarters available
results = []

for mcg in df['mcg'].unique():
    df_mcg = df[df['mcg'] == mcg].set_index('time_period_value')
    for p1, p2, p3 in periods:
        # Skip periods for which this MCG lacks one of the quarters
        if not all(quarter in df_mcg.index for quarter in (p1, p2, p3)):
            continue
        val1, val2, val3 = (df_mcg.loc[quarter, 'adjusted_Online_spend'] for quarter in (p1, p2, p3))
        # Change relative to the preceding quarter, in percent
        rebound_pct = ((val2 - val1) / val1) * 100
        drop_pct = ((val3 - val2) / val2) * 100
        record = {
            'mcg': mcg,
            'period': f"{p1}→{p2}→{p3}",
            'rebound_%': round(rebound_pct, 2),
            'drop_%': round(drop_pct, 2)
        }
        results.append(record)

# Tabulate and persist
change_df = pd.DataFrame(results)
change_df.to_csv("MCG_Percentage_Changes_Diamond_Online_Adj.csv", index=False)

# Echo the name of the file that was written
print("MCG_Percentage_Changes_Diamond_Online_Adj.csv")


In [None]:
# Diamond pattern (Online, adjusted values): mean rebound / drop percentage per
# MCG over the periods recorded in MCG_Percentage_Changes_Diamond_Online_Adj.csv.

import pandas as pd

df = pd.read_csv("MCG_Percentage_Changes_Diamond_Online_Adj.csv")

# Average both percentage columns per MCG and prefix the column names with avg_
avg_changes = (
    df.groupby('mcg')[['rebound_%', 'drop_%']]
    .mean()
    .reset_index()
    .rename(columns={'rebound_%': 'avg_rebound_%', 'drop_%': 'avg_drop_%'})
)

avg_changes.to_csv("MCG_Average_Percentage_Changes_Online_Adj.csv", index=False)

# Echo the name of the file that was written
print("MCG_Average_Percentage_Changes_Online_Adj.csv")


In [None]:
# Diamond pattern: percentage change of indexed Online spend per MCG
# across each Q4 -> Q1 -> Q2 sequence.

import pandas as pd

df = pd.read_csv("Indexed_Online_MCGs.csv")

# Quarter labels are compared as strings below
df['time_period_value'] = df['time_period_value'].astype(str)

# Each tuple is (Q4, following Q1, following Q2)
periods = [
    ("2021Q4", "2022Q1", "2022Q2"),
    ("2022Q4", "2023Q1", "2023Q2"),
    ("2023Q4", "2024Q1", "2024Q2"),
    ("2024Q4", "2025Q1", "2025Q2")
]

# One record per (MCG, period) that has all three quarters available
results = []

for mcg in df['mcg'].unique():
    df_mcg = df[df['mcg'] == mcg].set_index('time_period_value')
    for p1, p2, p3 in periods:
        # Skip periods for which this MCG lacks one of the quarters
        if not all(quarter in df_mcg.index for quarter in (p1, p2, p3)):
            continue
        val1, val2, val3 = (df_mcg.loc[quarter, 'Indexed Spend Online'] for quarter in (p1, p2, p3))
        # Change relative to the preceding quarter, in percent
        drop_pct = ((val2 - val1) / val1) * 100
        rebound_pct = ((val3 - val2) / val2) * 100
        record = {
            'mcg': mcg,
            'period': f"{p1}→{p2}→{p3}",
            'Q4 to Q1_%': round(drop_pct, 2),
            'Q1 to Q2_%': round(rebound_pct, 2)
        }
        results.append(record)

# Tabulate and persist
change_df = pd.DataFrame(results)
change_df.to_csv("MCG_Percentage_Changes_Diamond_Online.csv", index=False)

# Echo the name of the file that was written
print("MCG_Percentage_Changes_Diamond_Online.csv")


In [None]:
# Diamond pattern (Online, indexed values): mean Q4->Q1 and Q1->Q2 percentage
# change per MCG over the periods recorded in MCG_Percentage_Changes_Diamond_Online.csv.

import pandas as pd

df = pd.read_csv("MCG_Percentage_Changes_Diamond_Online.csv")

# Average both percentage columns per MCG and prefix the column names with avg_
avg_changes = (
    df.groupby('mcg')[['Q4 to Q1_%', 'Q1 to Q2_%']]
    .mean()
    .reset_index()
    .rename(columns={'Q4 to Q1_%': 'avg_Q4 to Q1_%', 'Q1 to Q2_%': 'avg_Q1 to Q2_%'})
)

avg_changes.to_csv("MCG_Average_Percentage_Changes_Online.csv", index=False)

# Echo the name of the file that was written
print("MCG_Average_Percentage_Changes_Online.csv")


In [None]:
# To calculated the average offset between indexed online and indexed face-to-face (F2F) spend 
# for each MCG across the three “diamond” quarters: 2022Q1, 2023Q1, and 2024Q1.

# Positive offset = Online spend was significantly higher than F2F spend.
# Negative offset = F2F spend was higher than online spend (or online dropped more).

import pandas as pd

# Load the indexed online and face-to-face MCG data
online_df = pd.read_csv("Indexed_Online_MCGs.csv")
f2f_df = pd.read_csv("Indexed_F2F_MCGs.csv")

# Filter for the three diamond quarters
diamond_quarters = ["2022Q1", "2023Q1", "2024Q1"]
online_filtered = online_df[online_df["time_period_value"].isin(diamond_quarters)]
f2f_filtered = f2f_df[f2f_df["time_period_value"].isin(diamond_quarters)]

# Merge the datasets on time_period_value and mcg.
# pd.merge defaults to an inner join, so a (quarter, MCG) pair that is missing
# from either file is dropped and does not contribute to that MCG's average.
merged_df = pd.merge(
    online_filtered,
    f2f_filtered,
    on=["time_period_value", "mcg"],
    suffixes=("_online", "_f2f")
)

# Calculate the offset (online - f2f) for each MCG in each quarter
merged_df["offset"] = merged_df["Indexed Spend Online"] - merged_df["Indexed Spend F2F"]

# Group by MCG and calculate the average offset across the diamond quarters.
# Why average:
# - Identifies consistent patterns: MCGs that repeatedly show online vs. F2F divergence during Q1s.
# - Smooths out volatility: if one quarter had an anomaly (e.g. a spike or dip), the average balances it.
# - Simplifies comparison: gives a single number per MCG to rank and compare.
# The result is sorted from the largest (most positive) to the smallest (most negative) offset.
offset_summary = merged_df.groupby("mcg")["offset"].mean().reset_index()
offset_summary = offset_summary.sort_values(by="offset", ascending=False)

# Top 10 MCGs with the strongest positive and the strongest negative offsets.
# Filter by sign first: a plain head(10)/tail(10) would put negative offsets in the
# "positive" list (or vice versa) when fewer than 10 MCGs have that sign, and the two
# lists would overlap when there are fewer than 20 MCGs in total.
# Both selections keep the descending order of offset_summary.
top_positive_offsets = offset_summary[offset_summary["offset"] > 0].head(10)
top_negative_offsets = offset_summary[offset_summary["offset"] < 0].tail(10)

top_positive_offsets, top_negative_offsets



In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the indexed online and face-to-face MCG data
online_df = pd.read_csv("Indexed_Online_MCGs.csv")
f2f_df = pd.read_csv("Indexed_F2F_MCGs.csv")

# Filter for the three diamond quarters
diamond_quarters = ["2022Q1", "2023Q1", "2024Q1"]
online_filtered = online_df[online_df["time_period_value"].isin(diamond_quarters)]
f2f_filtered = f2f_df[f2f_df["time_period_value"].isin(diamond_quarters)]

# Merge the datasets on time_period_value and mcg.
# pd.merge defaults to an inner join, so a (quarter, MCG) pair that is missing
# from either file is dropped and does not contribute to that MCG's average.
merged_df = pd.merge(
    online_filtered,
    f2f_filtered,
    on=["time_period_value", "mcg"],
    suffixes=("_online", "_f2f")
)

# Calculate the offset (online - f2f) for each MCG in each quarter
merged_df["offset"] = merged_df["Indexed Spend Online"] - merged_df["Indexed Spend F2F"]

# Group by MCG and calculate the average offset across the diamond quarters,
# sorted from the largest (most positive) to the smallest (most negative) offset
offset_summary = merged_df.groupby("mcg")["offset"].mean().reset_index()
offset_summary = offset_summary.sort_values(by="offset", ascending=False)

# Top 10 MCGs with the strongest positive and the strongest negative offsets.
# Filter by sign first: a plain head(10)/tail(10) would mislabel bars when fewer than
# 10 MCGs have a given sign, and would select the same MCG twice (duplicate rows after
# the concat below) when there are fewer than 20 MCGs in total.
top_positive_offsets = offset_summary[offset_summary["offset"] > 0].head(10)
top_negative_offsets = offset_summary[offset_summary["offset"] < 0].tail(10)

# Combine for visualization, ordered so the most negative bar is drawn first
combined_offsets = pd.concat([top_positive_offsets, top_negative_offsets])
combined_offsets = combined_offsets.sort_values(by="offset")

# One colour per bar: skyblue for positive offsets, plum for negative ones
bar_colors = ['skyblue' if x > 0 else 'plum' for x in combined_offsets["offset"]]

# Plotting (explicit figure/axes so the saved figure is unambiguous)
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(combined_offsets["mcg"], combined_offsets["offset"], color=bar_colors)

ax.set_xlabel("Average Offset (Online - F2F)")
# Build the title from diamond_quarters so it always matches the quarters actually used
ax.set_title(
    f"Top 10 Positive and Negative MCG Offsets (Average across {', '.join(diamond_quarters)})"
)
ax.axvline(0, color='black', linewidth=0.8)  # zero line separating positive from negative bars
ax.grid(axis='x', linestyle='--', alpha=0.7)  # Add vertical gridlines
ax.grid(axis='y', linestyle='--', alpha=0.7)  # Add horizontal gridlines
fig.tight_layout()
fig.savefig("offset_comparison.png")
plt.show()




In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the indexed spend series for both channels
online_df = pd.read_csv("Indexed_Online_MCGs.csv")
f2f_df = pd.read_csv("Indexed_F2F_MCGs.csv")

# Keep only the rows belonging to the three diamond quarters
diamond_quarters = ["2022Q1", "2023Q1", "2024Q1"]
online_filtered = online_df.loc[online_df["time_period_value"].isin(diamond_quarters)]
f2f_filtered = f2f_df.loc[f2f_df["time_period_value"].isin(diamond_quarters)]

# Pair online and F2F rows per (quarter, MCG), then add the per-row offset (online - f2f)
merged_df = online_filtered.merge(
    f2f_filtered,
    on=["time_period_value", "mcg"],
    suffixes=("_online", "_f2f"),
).assign(
    offset=lambda paired: paired["Indexed Spend Online"] - paired["Indexed Spend F2F"]
)

# Mean offset per MCG across the diamond quarters, largest first
offset_summary = (
    merged_df.groupby("mcg")["offset"]
    .mean()
    .reset_index()
    .sort_values(by="offset", ascending=False)
)

# One colour per bar: skyblue when the offset is >= 0, plum otherwise
colors = offset_summary["offset"].ge(0).map({True: 'skyblue', False: 'plum'}).tolist()

# Vertical bar chart of every MCG, with gridlines on both axes
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(offset_summary["mcg"], offset_summary["offset"], color=colors)
ax.tick_params(axis='x', labelrotation=90)  # MCG names are long; rotate them to avoid overlap
ax.set_xlabel("MCG")
ax.set_ylabel("Average Offset (Online - F2F)")
ax.set_title("Average Offset Comparison Across Diamond Quarters")
for grid_axis in ('x', 'y'):
    ax.grid(axis=grid_axis, linestyle='--', alpha=0.7)
fig.tight_layout()

# Write the figure to disk before showing it
fig.savefig("offset_comparison_with_gridlines.png")
plt.show()

