In [99]:
import pandas as pd
import numpy as np

In [100]:
df = pd.read_csv('../output/voice_video_merged.csv')


Columns (4,6) have mixed types. Specify dtype option on import or set low_memory=False.



In [101]:
df.columns

Index(['InteractionID', 'StoreID', 'ProductID', 'TransactionDate', 'DeviceID',
       'FacialID', 'Sex', 'Age', 'EmotionalState', 'TranscriptionText',
       'Gender', 'Barangay', 'canonical_tx_id_norm', 'canonical_tx_id',
       'storeId_voice', 'deviceId_voice', 'timestamp_voice', 'transactionId',
       'brandDetection_voice', 'items_voice', 'totals_voice',
       'transactionContext_voice', 'privacy_voice', 'processingTime_voice',
       'edgeVersion_voice', '_file_path_voice', 'privacy_audioStored_voice',
       'privacy_brandAnalysisOnly_voice', 'privacy_noFacialRecognition_voice',
       'privacy_noImageProcessing_voice', 'privacy_dataRetentionDays_voice',
       'privacy_anonymizationLevel_voice', 'privacy_consentTimestamp_voice',
       'totals_totalAmount_voice', 'totals_totalItems_voice',
       'totals_brandedAmount_voice', 'totals_unbrandedAmount_voice',
       'totals_brandedCount_voice', 'totals_unbrandedCount_voice',
       'totals_uniqueBrandsCount_voice', 'transaction

In [102]:
filtered = df[df['InteractionID'].notna() & df['transactionId'].notna()].copy()

In [103]:
def normalize_iso(ts):
    if pd.isna(ts):
        return ts
    ts = str(ts)
    if ts.endswith('.000Z') and ts.count('.') > 1:
        return ts[:-4] + 'Z'     # trim the extra `.000`
    return ts

filtered['privacy_consentTimestamp_voice'] = (
    filtered['privacy_consentTimestamp_voice']
      .apply(normalize_iso)
      .pipe(pd.to_datetime, errors='coerce', utc=True)
)


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



In [104]:
filtered['TransactionDate'] = pd.to_datetime(filtered['TransactionDate'])
filtered['privacy_consentTimestamp_voice'] = pd.to_datetime(filtered['privacy_consentTimestamp_voice'])
filtered['TransactionDate'] = filtered['TransactionDate'].fillna(filtered['privacy_consentTimestamp_voice'])

## Extended Exploration Set-Up


In [122]:
import json
from ast import literal_eval
import plotly.express as px
import plotly.graph_objects as go


def safe_json(value, default):
    """Convert mixed JSON strings/lists/dicts into python objects."""
    if pd.isna(value):
        return default
    if isinstance(value, (dict, list)):
        return value
    text = str(value).strip()
    if not text:
        return default
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        try:
            return literal_eval(text)
        except (ValueError, SyntaxError):
            return default


def ensure_datetime(series):
    """Standardize datetime with UTC awareness."""
    return pd.to_datetime(series, errors='coerce', utc=True)



In [123]:
# --- data hygiene -----------------------------------------------------------

filtered['TransactionDate'] = ensure_datetime(filtered['TransactionDate'])
filtered['privacy_consentTimestamp_voice'] = ensure_datetime(filtered['privacy_consentTimestamp_voice'])
filtered['TransactionDate'] = filtered['TransactionDate'].fillna(filtered['privacy_consentTimestamp_voice'])

# standardize gender entries (strip stray quotes) then drop unknowns
filtered['gender_clean'] = (
    filtered['Gender']
    .astype(str)
    .str.strip()
    .str.strip("'\"")
    .str.title()
)
filtered['gender_clean'] = filtered['gender_clean'].where(filtered['gender_clean'].isin(['Male', 'Female']))
filtered = filtered.dropna(subset=['TransactionDate', 'gender_clean'])

# numeric age + bucket
filtered['Age'] = pd.to_numeric(filtered['Age'], errors='coerce')
age_bins = [0, 17, 24, 34, 44, 54, 120]
age_labels = ['<18', '18-24', '25-34', '35-44', '45-54', '55+']
filtered['age_bucket'] = pd.cut(filtered['Age'], bins=age_bins, labels=age_labels, right=True)

# convenience date columns
filtered['txn_date'] = filtered['TransactionDate'].dt.date
filtered['txn_month'] = filtered['TransactionDate'].dt.to_period('M').dt.to_timestamp()
filtered['txn_weekday'] = filtered['TransactionDate'].dt.day_name()
filtered['txn_hour'] = filtered['TransactionDate'].dt.hour





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Converting to PeriodArray/Index representation will drop timezone information.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inst

In [124]:
# --- item-level expansion for product analytics ----------------------------

items_series = filtered['items_voice'].apply(lambda x: safe_json(x, []))
items_df = filtered[['InteractionID', 'TransactionDate', 'gender_clean', 'age_bucket', 'Age', 'transactionContext_paymentMethod_voice', 'totals_totalAmount_voice']].copy()
items_df['items'] = items_series
items_df = items_df.explode('items').dropna(subset=['items'])
item_details = pd.json_normalize(items_df['items'])
items_df = pd.concat([items_df.reset_index(drop=True), item_details], axis=1).drop(columns=['items'])

for col in ['totalPrice', 'unitPrice', 'quantity']:
    if col not in items_df.columns:
        items_df[col] = np.nan
items_df['totalPrice'] = pd.to_numeric(items_df['totalPrice'], errors='coerce')
items_df['unitPrice'] = pd.to_numeric(items_df['unitPrice'], errors='coerce')
items_df['quantity'] = pd.to_numeric(items_df['quantity'], errors='coerce')
for text_col in ['category', 'brandName', 'productName', 'sku']:
    if text_col not in items_df.columns:
        items_df[text_col] = np.nan
items_df['category'] = items_df['category'].fillna('Unspecified')
items_df['brandName'] = items_df['brandName'].fillna('Unspecified')

# helper for time-of-day segments
bins = [0, 5, 12, 18, 22, 24]
labels = ['Late Night (10p-5a)', 'Morning (5a-12p)', 'Afternoon (12p-6p)', 'Evening (6p-10p)', 'Late Night (10p-12a)']
filtered['timeofday_segment'] = pd.cut(filtered['txn_hour'], bins=bins, labels=labels, right=False, include_lowest=True)
items_df = items_df.merge(filtered[['InteractionID', 'timeofday_segment', 'txn_weekday']], on='InteractionID', how='left')





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Demographics — Gender View


In [125]:
gender_summary = (
    filtered.groupby('gender_clean')
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_gender_txn = px.bar(
    gender_summary,
    x='gender_clean',
    y='total_transactions',
    color='gender_clean',
    title='Total Transactions by Gender',
    labels={'gender_clean': 'Gender', 'total_transactions': 'Transactions'},
    text_auto=True
)
fig_gender_txn.update_layout(showlegend=False)

fig_gender_avg = px.bar(
    gender_summary,
    x='gender_clean',
    y='avg_spend',
    color='gender_clean',
    title='Average Basket Spend by Gender',
    labels={'avg_spend': 'Average Spend (₱)'},
    text_auto='.2f'
)
fig_gender_avg.update_layout(showlegend=False)

monthly_gender = (
    filtered.groupby(['txn_month', 'gender_clean'])
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_gender_mom = px.line(
    monthly_gender,
    x='txn_month',
    y='total_transactions',
    color='gender_clean',
    markers=True,
    title='Month-on-Month Transactions by Gender',
    labels={'txn_month': 'Month', 'total_transactions': 'Transactions'}
)

fig_gender_txn.show()
fig_gender_avg.show()
fig_gender_mom.show()




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



## Demographics — Age Buckets


In [126]:
age_summary = (
    filtered.dropna(subset=['age_bucket'])
      .groupby('age_bucket')
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_age = px.bar(
    age_summary,
    x='age_bucket',
    y='total_transactions',
    color='avg_spend',
    title='Transactions & Avg Spend by Age Bucket',
    labels={'age_bucket': 'Age Bucket', 'total_transactions': 'Transactions', 'avg_spend': 'Avg Spend'},
    text='avg_spend',
    color_continuous_scale='Blues'
)
fig_age.update_traces(texttemplate='₱%{text:.0f}', textposition='outside')
fig_age.show()







## Demographics — Tender Type (Cash vs E-Wallet)


In [127]:
filtered.columns

Index(['InteractionID', 'StoreID', 'ProductID', 'TransactionDate', 'DeviceID',
       'FacialID', 'Sex', 'Age', 'EmotionalState', 'TranscriptionText',
       'Gender', 'Barangay', 'canonical_tx_id_norm', 'canonical_tx_id',
       'storeId_voice', 'deviceId_voice', 'timestamp_voice', 'transactionId',
       'brandDetection_voice', 'items_voice', 'totals_voice',
       'transactionContext_voice', 'privacy_voice', 'processingTime_voice',
       'edgeVersion_voice', '_file_path_voice', 'privacy_audioStored_voice',
       'privacy_brandAnalysisOnly_voice', 'privacy_noFacialRecognition_voice',
       'privacy_noImageProcessing_voice', 'privacy_dataRetentionDays_voice',
       'privacy_anonymizationLevel_voice', 'privacy_consentTimestamp_voice',
       'totals_totalAmount_voice', 'totals_totalItems_voice',
       'totals_brandedAmount_voice', 'totals_unbrandedAmount_voice',
       'totals_brandedCount_voice', 'totals_unbrandedCount_voice',
       'totals_uniqueBrandsCount_voice', 'transaction

In [128]:
tender_summary = (
    filtered.groupby('transactionContext_paymentMethod_voice')
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_tender = px.bar(
    tender_summary,
    x='transactionContext_paymentMethod_voice',
    y='total_transactions',
    color='transactionContext_paymentMethod_voice',
    title='Transactions by Tender Type',
    text_auto=True,
    labels={'transactionContext_paymentMethod_voice': 'Payment Method', 'total_transactions': 'Transactions'}
)
fig_tender.update_layout(showlegend=False)

fig_tender_avg = px.scatter(
    tender_summary,
    x='transactionContext_paymentMethod_voice',
    y='avg_spend',
    size='total_transactions',
    color='avg_spend',
    title='Average Spend by Tender Type',
    labels={'avg_spend': 'Average Spend (₱)'},
    color_continuous_scale='Viridis'
)

fig_tender.show()
fig_tender_avg.show()



## Shopping Behavior — Weekday / Weekend & Time of Day


In [129]:
filtered['weekday_type'] = np.where(filtered['TransactionDate'].dt.dayofweek >= 5, 'Weekend', 'Weekday')

week_summary = (
    filtered.groupby('weekday_type')
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_week = px.bar(
    week_summary,
    x='weekday_type',
    y='total_transactions',
    color='weekday_type',
    text_auto=True,
    title='Transactions: Weekday vs Weekend'
)
fig_week.update_layout(showlegend=False)

fig_week_avg = px.bar(
    week_summary,
    x='weekday_type',
    y='avg_spend',
    color='weekday_type',
    text_auto='.2f',
    title='Average Spend: Weekday vs Weekend'
)
fig_week_avg.update_layout(showlegend=False)

timeofday_summary = (
    filtered.dropna(subset=['timeofday_segment'])
      .groupby(['weekday_type', 'timeofday_segment'])
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_time = px.bar(
    timeofday_summary,
    x='timeofday_segment',
    y='total_transactions',
    color='weekday_type',
    barmode='group',
    title='Transactions by Time of Day (Weekday vs Weekend)',
    labels={'timeofday_segment': 'Time of Day'}
)

fig_time_avg = px.line(
    timeofday_summary,
    x='timeofday_segment',
    y='avg_spend',
    color='weekday_type',
    markers=True,
    title='Average Spend by Time of Day'
)

for fig in [fig_week, fig_week_avg, fig_time, fig_time_avg]:
    fig.show()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





## Shopping Behavior — Payday Windows


In [130]:
payday_days = {12, 13, 14, 27, 28, 29}
filtered['payday_window'] = np.where(filtered['TransactionDate'].dt.day.isin(payday_days), 'Payday Window', 'Rest of Month')

payday_summary = (
    filtered.groupby('payday_window')
      .agg(total_transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'),
           total_revenue=('totals_totalAmount_voice', 'sum'))
      .reset_index()
)

fig_payday = px.bar(
    payday_summary,
    x='payday_window',
    y='total_transactions',
    color='payday_window',
    text_auto=True,
    title='Transactions During Payday Windows'
)
fig_payday.update_layout(showlegend=False)

fig_payday_avg = px.bar(
    payday_summary,
    x='payday_window',
    y='avg_spend',
    color='payday_window',
    text_auto='.2f',
    title='Average Spend During Payday Windows'
)
fig_payday_avg.update_layout(showlegend=False)

fig_payday_rev = px.pie(
    payday_summary,
    names='payday_window',
    values='total_revenue',
    title='Revenue Split — Payday vs Rest of Month'
)

fig_payday.show()
fig_payday_avg.show()
fig_payday_rev.show()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Basket Value & Price-Point Signals


In [131]:
basket_bins = [0, 10, 20, 50, 100, 200, np.inf]
basket_labels = ['₱0-10', '₱11-20', '₱21-50', '₱51-100', '₱101-200', '₱200+']
filtered['basket_band'] = pd.cut(filtered['totals_totalAmount_voice'], bins=basket_bins, labels=basket_labels, right=True)

basket_summary = (
    filtered.dropna(subset=['basket_band'])
      .groupby('basket_band')
      .agg(transactions=('InteractionID', 'count'),
           avg_spend=('totals_totalAmount_voice', 'mean'))
      .reset_index()
)

fig_basket = px.bar(
    basket_summary,
    x='basket_band',
    y='transactions',
    color='avg_spend',
    title='Basket Value Distribution',
    text='transactions',
    labels={'basket_band': 'Basket Band', 'transactions': 'Transactions'},
    color_continuous_scale='Tealgrn'
)
fig_basket.show()

# SKU price points at 5/10/15 pesos (±0.25 tolerance)
round_targets = [5, 10, 15]
tolerance = 0.25

def closest_round(value):
    if pd.isna(value):
        return None
    for target in round_targets:
        if abs(value - target) <= tolerance:
            return f'₱{target}'
    return None

items_df['round_price_flag'] = items_df['unitPrice'].apply(closest_round)
round_summary = (
    items_df.dropna(subset=['round_price_flag'])
            .groupby('round_price_flag')
            .agg(freq=('sku', 'count'))
            .reset_index()
)

fig_round = px.pie(
    round_summary,
    names='round_price_flag',
    values='freq',
    title='Share of Items Sold at ₱5 / ₱10 / ₱15',
    hole=0.35
)

fig_round.show()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





## Top Products by Daypart


In [132]:
daypart_products = (
    items_df.dropna(subset=['timeofday_segment'])
            .groupby(['timeofday_segment', 'brandName'])
            .agg(quantity=('quantity', 'sum'),
                 revenue=('totalPrice', 'sum'))
            .reset_index()
)

# keep top 5 per segment
ranked = daypart_products.sort_values(['timeofday_segment', 'quantity'], ascending=[True, False])
ranked['rank'] = ranked.groupby('timeofday_segment')['quantity'].rank(method='first', ascending=False)
top_daypart = ranked[ranked['rank'] <= 5]

fig_daypart = px.bar(
    top_daypart,
    x='brandName',
    y='quantity',
    color='revenue',
    facet_col='timeofday_segment',
    facet_col_wrap=2,
    title='Top 5 Brands per Daypart',
    labels={'brandName': 'Brand', 'quantity': 'Qty', 'revenue': 'Revenue'},
    color_continuous_scale='OrRd'
)
fig_daypart.for_each_xaxis(lambda ax: ax.update(tickangle=45))
fig_daypart.update_layout(height=600)
fig_daypart.show()











## Category Performance Breakdown


In [133]:
category_summary = (
    items_df.groupby('category')
            .agg(units=('quantity', 'sum'),
                 revenue=('totalPrice', 'sum'))
            .reset_index()
            .sort_values('revenue', ascending=False)
)

fig_cat_rank = px.bar(
    category_summary.head(15),
    x='category',
    y='revenue',
    text='units',
    title='Top Categories by Revenue (text = units)',
    labels={'revenue': 'Revenue', 'category': 'Category'}
)
fig_cat_rank.update_traces(texttemplate='%{text:.0f} units', textposition='outside')
fig_cat_rank.update_layout(xaxis_tickangle=45)
fig_cat_rank.show()

category_by_day = (
    items_df.groupby(['txn_weekday', 'category'])
            .agg(units=('quantity', 'sum'))
            .reset_index()
)
heat_day = category_by_day.pivot(index='txn_weekday', columns='category', values='units').fillna(0)
fig_cat_day = px.imshow(
    heat_day,
    aspect='auto',
    color_continuous_scale='Purples',
    title='Category Units by Day of Week'
)
fig_cat_day.show()

category_by_gender = (
    items_df.groupby(['gender_clean', 'category'])
            .agg(units=('quantity', 'sum'))
            .reset_index()
)
fig_cat_gender = px.bar(
    category_by_gender,
    x='category',
    y='units',
    color='gender_clean',
    barmode='group',
    title='Category Units by Gender'
)
fig_cat_gender.update_layout(xaxis_tickangle=45)
fig_cat_gender.show()

category_by_age = (
    items_df.dropna(subset=['age_bucket'])
            .groupby(['age_bucket', 'category'])
            .agg(units=('quantity', 'sum'))
            .reset_index()
)
fig_cat_age = px.bar(
    category_by_age,
    x='age_bucket',
    y='units',
    color='category',
    title='Category Mix by Age Bucket',
    labels={'age_bucket': 'Age Bucket'},
    barmode='stack'
)
fig_cat_age.show()







## Category Composition by Basket Band


In [134]:
items_with_bands = items_df.merge(filtered[['InteractionID', 'basket_band']], on='InteractionID', how='left')
composition = (
    items_with_bands.dropna(subset=['basket_band'])
                    .groupby(['basket_band', 'category'])
                    .agg(revenue=('totalPrice', 'sum'))
                    .reset_index()
)

fig_comp = px.bar(
    composition,
    x='basket_band',
    y='revenue',
    color='category',
    title='Category Composition within Basket Bands',
    labels={'basket_band': 'Basket Band', 'revenue': 'Revenue'},
    barmode='stack'
)
fig_comp.show()







## Frequently Paired Products


In [135]:
from itertools import combinations

pairs = (
    items_df.groupby('InteractionID')['brandName']
            .apply(lambda brands: list(combinations(sorted(set([b for b in brands if isinstance(b, str) and b.strip()])), 2)))
            .explode()
            .dropna()
)

# Count pairs and convert to DataFrame
pair_counts = pairs.value_counts().reset_index()
# The first column contains the pair tuples, rename it
pair_counts.columns = ['pair', 'frequency']

# Extract Brand A and Brand B from the pair tuples
pair_counts[['Brand A', 'Brand B']] = pd.DataFrame(pair_counts['pair'].tolist(), index=pair_counts.index)

fig_pairs = px.bar(
    pair_counts.head(15),
    x='frequency',
    y='Brand A',
    color='Brand B',
    orientation='h',
    title='Top Product Pairings',
    labels={'frequency': 'Co-occurrences'}
)
fig_pairs.show()


## Tobacco Analysis


In [138]:
tobacco_keywords = ['tobacco', 'cig', 'marlboro', 'fort', 'chester', 'winston', 'hope']

def is_tobacco(row):
    text = ' '.join([
        str(row.get('category', '')),
        str(row.get('brandName', '')),
        str(row.get('productName', ''))
    ]).lower()
    return any(keyword in text for keyword in tobacco_keywords)

tobacco_df = items_df[items_df.apply(is_tobacco, axis=1)].copy()

if not tobacco_df.empty:
    tobacco_df['hour'] = tobacco_df['TransactionDate'].dt.hour
    tobacco_df['weekday'] = tobacco_df['TransactionDate'].dt.day_name()

    time_qty = (
        tobacco_df.groupby('hour')
                  .agg(total_qty=('quantity', 'sum'),
                       avg_qty=('quantity', 'mean'))
                  .reset_index()
    )
    fig_tob_time = px.line(
        time_qty,
        x='hour',
        y='total_qty',
        markers=True,
        title='Tobacco Volume by Hour of Day',
        labels={'hour': 'Hour', 'total_qty': 'Qty'}
    )
    fig_tob_time.add_trace(go.Bar(x=time_qty['hour'], y=time_qty['avg_qty'], name='Avg Qty', opacity=0.3))
    fig_tob_time.show()

    brand_summary = (
        tobacco_df.groupby('brandName')
                  .agg(total_qty=('quantity', 'sum'),
                       revenue=('totalPrice', 'sum'))
                  .reset_index()
                  .sort_values('total_qty', ascending=False)
    )
    fig_tob_brand = px.bar(
        brand_summary.head(10),
        x='brandName',
        y='total_qty',
        color='revenue',
        title='Top Tobacco Brands',
        labels={'brandName': 'Brand', 'total_qty': 'Qty'}
    )
    fig_tob_brand.show()

    brand_day = (
        tobacco_df.groupby(['weekday', 'brandName'])
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_tob_brand_day = px.bar(
        brand_day,
        x='weekday',
        y='total_qty',
        color='brandName',
        title='Tobacco Qty by Day x Brand',
        barmode='stack'
    )
    fig_tob_brand_day.show()

    day_avg = (
        tobacco_df.groupby('weekday')
                  .agg(avg_qty=('quantity', 'mean'))
                  .reset_index()
    )
    fig_tob_day_avg = px.bar(
        day_avg,
        x='weekday',
        y='avg_qty',
        title='Average Tobacco Qty per Transaction by Day'
    )
    fig_tob_day_avg.show()
else:
    print('No tobacco records detected — adjust keyword list if needed.')



### Tobacco — Demographic Splits


In [139]:
if not tobacco_df.empty:
    tob_gender = (
        tobacco_df.groupby('gender_clean')
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_tob_gender = px.bar(
        tob_gender,
        x='gender_clean',
        y='total_qty',
        title='Tobacco Qty by Gender',
        text_auto=True
    )
    fig_tob_gender.show()

    tob_age = (
        tobacco_df.dropna(subset=['age_bucket'])
                  .groupby('age_bucket')
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_tob_age = px.bar(
        tob_age,
        x='age_bucket',
        y='total_qty',
        title='Tobacco Qty by Age Bucket'
    )
    fig_tob_age.show()

    tob_gender_brand = (
        tobacco_df.groupby(['gender_clean', 'brandName'])
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_tob_gender_brand = px.bar(
        tob_gender_brand,
        x='brandName',
        y='total_qty',
        color='gender_clean',
        title='Tobacco Brands by Gender',
        barmode='group'
    )
    fig_tob_gender_brand.update_layout(xaxis_tickangle=45)
    fig_tob_gender_brand.show()







## Marlboro Basket Attachments


In [140]:
marlboro_txn_ids = items_df[items_df['brandName'].str.contains('marlboro', case=False, na=False)]['InteractionID'].unique()
marlboro_baskets = items_df[items_df['InteractionID'].isin(marlboro_txn_ids)]

co_brands = (
    marlboro_baskets[~marlboro_baskets['brandName'].str.contains('marlboro', case=False, na=False)]
                 .groupby('brandName')
                 .agg(co_occurrences=('InteractionID', 'nunique'),
                      units=('quantity', 'sum'),
                      revenue=('totalPrice', 'sum'))
                 .reset_index()
                 .sort_values('co_occurrences', ascending=False)
)
fig_marlboro_brands = px.bar(
    co_brands.head(15),
    x='co_occurrences',
    y='brandName',
    orientation='h',
    title='Brands Purchased with Marlboro',
    labels={'co_occurrences': 'Number of Shared Transactions'}
)
fig_marlboro_brands.show()

co_categories = (
    marlboro_baskets.groupby('category')
                    .agg(co_occurrences=('InteractionID', 'nunique'))
                    .reset_index()
                    .sort_values('co_occurrences', ascending=False)
)
fig_marlboro_categories = px.bar(
    co_categories,
    x='category',
    y='co_occurrences',
    title='Categories Purchased with Marlboro'
)
fig_marlboro_categories.update_layout(xaxis_tickangle=45)
fig_marlboro_categories.show()



## Laundry Analysis


In [141]:
laundry_keywords = ['laundry', 'detergent', 'surf', 'tide', 'breeze', 'downy', 'perla']

def is_laundry(row):
    text = ' '.join([
        str(row.get('category', '')),
        str(row.get('brandName', '')),
        str(row.get('productName', ''))
    ]).lower()
    return any(keyword in text for keyword in laundry_keywords)

laundry_df = items_df[items_df.apply(is_laundry, axis=1)].copy()

if not laundry_df.empty:
    laundry_df['hour'] = laundry_df['TransactionDate'].dt.hour
    laundry_df['weekday'] = laundry_df['TransactionDate'].dt.day_name()

    laundry_time = (
        laundry_df.groupby('hour')
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_laundry_time = px.line(
        laundry_time,
        x='hour',
        y='total_qty',
        markers=True,
        title='Laundry Purchases by Hour'
    )
    fig_laundry_time.show()

    laundry_day_avg = (
        laundry_df.groupby('weekday')
                  .agg(avg_qty=('quantity', 'mean'))
                  .reset_index()
    )
    fig_laundry_day = px.bar(
        laundry_day_avg,
        x='weekday',
        y='avg_qty',
        title='Laundry Avg Quantity by Weekday'
    )
    fig_laundry_day.show()

    laundry_brand = (
        laundry_df.groupby('brandName')
                  .agg(transactions=('InteractionID', 'nunique'),
                       avg_qty=('quantity', 'mean'))
                  .reset_index()
                  .sort_values('transactions', ascending=False)
    )
    fig_laundry_brand = px.scatter(
        laundry_brand,
        x='transactions',
        y='avg_qty',
        size='transactions',
        color='brandName',
        title='Laundry Brands — Volume vs Avg Qty'
    )
    fig_laundry_brand.show()

    laundry_gender = (
        laundry_df.groupby('gender_clean')
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_laundry_gender = px.bar(
        laundry_gender,
        x='gender_clean',
        y='total_qty',
        title='Laundry Qty by Gender'
    )
    fig_laundry_gender.show()

    laundry_age = (
        laundry_df.dropna(subset=['age_bucket'])
                  .groupby('age_bucket')
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_laundry_age = px.bar(
        laundry_age,
        x='age_bucket',
        y='total_qty',
        title='Laundry Qty by Age Bucket'
    )
    fig_laundry_age.show()

    laundry_gender_brand = (
        laundry_df.groupby(['gender_clean', 'brandName'])
                  .agg(total_qty=('quantity', 'sum'))
                  .reset_index()
    )
    fig_laundry_gender_brand = px.bar(
        laundry_gender_brand,
        x='brandName',
        y='total_qty',
        color='gender_clean',
        barmode='group',
        title='Laundry Brands by Gender'
    )
    fig_laundry_gender_brand.update_layout(xaxis_tickangle=45)
    fig_laundry_gender_brand.show()

    # Surf-focused baskets
    surf_txn_ids = laundry_df[laundry_df['brandName'].str.contains('surf', case=False, na=False)]['InteractionID'].unique()
    surf_baskets = laundry_df[laundry_df['InteractionID'].isin(surf_txn_ids)]
    surf_pairs = (
        surf_baskets[~surf_baskets['brandName'].str.contains('surf', case=False, na=False)]
                    .groupby('brandName')
                    .agg(co_occurrences=('InteractionID', 'nunique'))
                    .reset_index()
                    .sort_values('co_occurrences', ascending=False)
    )
    fig_surf = px.bar(
        surf_pairs.head(15),
        x='co_occurrences',
        y='brandName',
        orientation='h',
        title='Items Purchased with Surf'
    )
    fig_surf.show()
else:
    print('No laundry records detected — adjust keyword list if needed.')







## Notes for Metabase Implementation

- Each Plotly figure corresponds to a potential Metabase card; replicate the grouping logic (gender, daypart, basket bands, etc.) with the same aggregations.
- The helper columns (`gender_clean`, `age_bucket`, `basket_band`, `weekday_type`, etc.) can be materialized in SQL or a view to keep Metabase queries lightweight.
- Nested JSON fields (`totals_voice`, `transactionContext_voice`, `items_voice`) are flattened into `totals_*`, `context_*`, and `items_df`; mirror this flattening when building warehouse tables so filters remain fast.
- Daypart, payday windows, and round-price checks rely on simple CASE expressions, making them easy to port into SQL views feeding Metabase.
