# 2024 Segmentation - Mint Subs at Gross Add Month

In [None]:
%pip install snowflake-connector-python seaborn

In [None]:
# Packages
import numpy as np
import pandas as pd
import seaborn as sns
import snowflake.connector as snow
import matplotlib.pyplot as plt
import textwrap
from matplotlib.ticker import FixedLocator
from matplotlib.ticker import ScalarFormatter

import snowflake.connector as snow
from snowflake.connector import pandas_tools

In [None]:
# Open the Snowflake session used by every query in this notebook.
# The authenticator is set to "externalbrowser"; all other values are redacted here.
con = snow.connect(
    account="DESENSITIZED",
    user="DESENSITIZED",
    authenticator="externalbrowser",
    server="DESENSITIZED",
    database="DESENSITIZED",
    warehouse="DESENSITIZED",
)

# Single cursor reused by the query cells below
cur = con.cursor()

In [None]:
# Get Post-Modelling Output, don't consider nulls for now
# NOTE: the query text itself is redacted ("DESENSITIZED") in this copy of the notebook.
joined_post_mdl_sql = '''
DESENSITIZED
'''

# Run the query on the shared cursor, then pull the whole result set into a
# single pandas DataFrame (`df`), which every later cell reads from.
cur.execute(joined_post_mdl_sql)
df = cur.fetch_pandas_all()

In [None]:
# Sanity check: (rows, columns) of the fetched result
print(df.shape)

In [None]:
# List the column names returned by the query
print(df.columns.values)

In [None]:
# Normalise column types once, before any grouping or plotting.

# SNAPSHOT_DATE has to be a real datetime for the date-based axes
df['SNAPSHOT_DATE'] = pd.to_datetime(df['SNAPSHOT_DATE'])

# SEGM_CLUST becomes a nullable integer; values that cannot be parsed turn into <NA>
df['SEGM_CLUST'] = pd.to_numeric(df['SEGM_CLUST'], errors='coerce').astype('Int64')

# Fold the 'UNLIMITED_PLUS' data group into 'UNLIMITED'
df['PROD_DATA_GROUP'] = df['PROD_DATA_GROUP'].replace({'UNLIMITED_PLUS': 'UNLIMITED'})

# Plan duration is handled as a discrete label, so store it as text
df["PROD_DURATION_MONTHS"] = df["PROD_DURATION_MONTHS"].astype(str)

In [None]:
# Preview the first rows after the type clean-up
df.head()

In [None]:
# Human-readable description of every segment, keyed by its integer cluster id.
# The position of each entry in the tuple below is its cluster id (0 through 12).
cluster_descriptions = dict(enumerate((
    "African-American Dense w/ Low Churn & QoL",
    "Standard Baseline Subscribers w/ Low Churn",
    "African-American Dense w/ Moderate Churn & Tech Struggles",
    "Asian Dense w/ Strong Socioeconomic Traits & Solid Tenure",
    "Hispanics Dense w/ Struggling Socioeconomic Signs and Frequent Commutes",
    "Asian-Prevalent Promotion 'Chasers'",
    "Young Urbanites with National Retail Relevance",
    "Well-Tenured & Socioeconomic Affluent w/ Low Promo Relevance",
    "Well-off White Dense, w/ Low Churn and Strong Tenure",
    "Well-off White Dense, w/ Less Usage & Some Tech Gaps",
    "Hispanic & 'Others' Dense w/ Poor Usage Metrics & Socioeconomic Indicators",
    "Low-Income Heavily White Dense",
    "Struggling Socioeconomic w/ High NR Relevance & Non-Existent Promo Relevance",
)))

In [None]:
def _label_snapshot_ticks(ax, snapshot_dates):
    """Put one x tick on each distinct snapshot date, labelled YYYY-MM-DD at 45 degrees."""
    ticks = snapshot_dates.unique()
    ax.set_xticks(ticks)
    ax.set_xticklabels(
        [pd.to_datetime(tick).strftime('%Y-%m-%d') for tick in ticks],
        rotation=45,
        ha='right',
    )


def _aggregate_for_plot(data, column_name, agg_method):
    """Aggregate ``column_name`` per SEGM_CLUST and SNAPSHOT_DATE.

    Returns ``(result, kind)`` where ``kind`` is 'subs', 'numeric' or
    'categorical'. Raises ValueError when ``agg_method`` is not defined for
    that kind of column.
    """
    keys = ['SEGM_CLUST', 'SNAPSHOT_DATE']

    if column_name == "SUB_BILLING_ID":
        kind, supported = 'subs', ('flat_count', 'rel_perc_clust')
    elif pd.api.types.is_numeric_dtype(data[column_name]):
        kind, supported = 'numeric', ('flat_mean', 'flat_sum', 'rel_perc_clust')
    else:
        kind, supported = 'categorical', ('flat_count', 'rel_perc_clust')

    if agg_method not in supported:
        raise ValueError(
            f"agg_method={agg_method!r} is not supported for {kind} column "
            f"{column_name!r}; expected one of {supported}"
        )

    if kind == 'subs':
        # Non-null subscriber ids per segment and snapshot date
        result = data.groupby(keys)[column_name].count().reset_index(name='count')
        if agg_method == 'rel_perc_clust':
            # Denominator: all subscribers on that snapshot date, across segments
            totals = data.groupby('SNAPSHOT_DATE')[column_name].count().reset_index(name='total_count')
            result = pd.merge(result, totals, on='SNAPSHOT_DATE')
            result['proportion'] = result['count'] / result['total_count']
    elif kind == 'numeric':
        grouped = data.groupby(keys)[column_name]
        if agg_method == 'flat_mean':
            result = grouped.mean().reset_index(name='mean')
        elif agg_method == 'flat_sum':
            result = grouped.sum().reset_index(name='sum')
        else:
            # Sum of the values divided by the number of non-null values in the
            # same segment and snapshot date (column names kept as 'count' /
            # 'total_count' for parity with the other paths).
            sums = grouped.sum().reset_index(name='count')
            counts = grouped.count().reset_index(name='total_count')
            result = pd.merge(sums, counts, on=keys)
            result['proportion'] = result['count'] / result['total_count']
    else:
        # Frequency of every category value per segment and snapshot date
        result = data.groupby(keys + [column_name]).size().reset_index(name='count')
        if agg_method == 'rel_perc_clust':
            # Denominator: rows with a non-null category in the same segment and date
            totals = data.dropna(subset=[column_name]).groupby(keys).size().reset_index(name='total_count')
            result = pd.merge(result, totals, on=keys, how='left')
            result['proportion'] = result['count'] / result['total_count']

    return result, kind


def _plot_segment_stacked_bar(result, column_name, weights, title, ylabel, color_mapping):
    """Draw one stacked bar chart of ``weights`` per snapshot date, stacked by segment."""
    plt.figure(figsize=(14, 8))
    ax = sns.histplot(data=result, x='SNAPSHOT_DATE', weights=weights, hue='SEGM_CLUST', multiple='stack',
                      bins=len(result['SNAPSHOT_DATE'].unique()), discrete=True, palette=color_mapping, shrink=8.0)
    plt.title(title)
    plt.xlabel('Snapshot Date')
    plt.ylabel(ylabel)
    _label_snapshot_ticks(ax, result['SNAPSHOT_DATE'])
    # Show full numbers on the y-axis instead of scientific notation
    ax.yaxis.set_major_formatter(ScalarFormatter())
    ax.ticklabel_format(style='plain', axis='y')
    # Hand-built legend, placed outside the plot, using the segment descriptions
    handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in color_mapping.values()]
    labels = [cluster_descriptions[int(segment)] for segment in color_mapping]
    ax.legend(handles, labels, title=column_name, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
    plt.tight_layout()
    plt.show()


def _plot_segment_lines(result, y, title, ylabel, color_mapping):
    """Draw one line chart of ``y`` over snapshot dates with a line per segment."""
    plt.figure(figsize=(14, 8))
    ax = sns.lineplot(data=result, x='SNAPSHOT_DATE', y=y, hue='SEGM_CLUST', marker='o', palette=color_mapping)
    plt.title(title)
    plt.xlabel('Snapshot Date')
    plt.ylabel(ylabel)
    # Swap the numeric segment ids in the legend for their descriptions
    handles, labels = ax.get_legend_handles_labels()
    labels = [cluster_descriptions[int(label)] for label in labels]
    ax.legend(handles, labels, title='Segment Descriptions', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
    _label_snapshot_ticks(ax, result['SNAPSHOT_DATE'])
    plt.tight_layout()
    plt.show()


def _plot_category_mix_by_segment(result, column_name, value_col, categories, color_mapping):
    """For every segment draw a stacked bar chart and a line chart of ``value_col`` per category."""
    if value_col == 'count':
        ylabel, title_kind = 'Subscriber Count', 'Flat Counts'
    else:
        ylabel, title_kind = 'Proportion of Subscribers', 'Proportions'

    for cluster in sorted(result['SEGM_CLUST'].unique()):
        cluster_df = result[result['SEGM_CLUST'] == cluster]
        title = f'GA Subs: {column_name} {title_kind} for Segment {cluster} ({cluster_descriptions[cluster]})'

        for chart in ('bar', 'line'):
            plt.figure(figsize=(14, 8))
            if chart == 'bar':
                ax = sns.histplot(data=cluster_df, x='SNAPSHOT_DATE', weights=value_col, hue=column_name,
                                  multiple='stack', bins=len(cluster_df['SNAPSHOT_DATE'].unique()),
                                  discrete=True, palette=color_mapping, shrink=8.0)
            else:
                ax = sns.lineplot(data=cluster_df, x='SNAPSHOT_DATE', y=value_col, hue=column_name,
                                  marker='o', palette=color_mapping)
            plt.ylabel(ylabel)
            plt.title(title)
            plt.xlabel('Snapshot Date')
            _label_snapshot_ticks(ax, cluster_df['SNAPSHOT_DATE'])
            # Same legend on every chart so category colours can be compared across segments
            handles = [plt.Rectangle((0, 0), 1, 1, color=color_mapping[value]) for value in categories]
            ax.legend(handles, categories, title=column_name, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
            plt.tight_layout()
            plt.show()


def plot_analysis(df, column_name, agg_method='flat_mean'):
    """Chart one column per segment (SEGM_CLUST) across snapshot dates.

    The charts drawn depend on the column:

    * ``"SUB_BILLING_ID"``: subscriber counts. ``'flat_count'`` draws a stacked
      bar chart and a line chart; ``'rel_perc_clust'`` draws a line chart of
      each segment's share of all subscribers on the snapshot date.
    * numeric column: ``'flat_mean'`` and ``'rel_perc_clust'`` (sum divided by
      the non-null count within the segment and date) draw a line chart;
      ``'flat_sum'`` draws a stacked bar chart and a line chart.
    * any other column: treated as categorical. For every segment a stacked
      bar chart and a line chart of the category counts (``'flat_count'``) or
      within-segment shares (``'rel_perc_clust'``) are drawn.

    Args:
        df: DataFrame with SEGM_CLUST, SNAPSHOT_DATE and ``column_name``
            columns. It is not modified.
        column_name: Column to analyse.
        agg_method: One of 'flat_mean', 'flat_sum', 'flat_count',
            'rel_perc_clust' (see above for which apply to which column).

    Returns:
        None. Figures are displayed with ``plt.show()``.

    Raises:
        ValueError: If ``agg_method`` is not supported for the column.
    """
    # Work on a narrow private copy: the integer cast below must not leak into
    # the caller's DataFrame, and only these columns are ever read.
    needed = list(dict.fromkeys(['SEGM_CLUST', 'SNAPSHOT_DATE', column_name]))
    data = df[needed].copy()
    data['SEGM_CLUST'] = data['SEGM_CLUST'].astype(int)

    result, kind = _aggregate_for_plot(data, column_name, agg_method)

    sns.set(style="whitegrid")

    if kind == 'categorical':
        # groupby never yields NaN categories, so leave them out of the palette and legend
        categories = list(data[column_name].dropna().unique())
        color_mapping = dict(zip(categories, sns.color_palette("husl", len(categories))))
        value_col = 'count' if agg_method == 'flat_count' else 'proportion'
        _plot_category_mix_by_segment(result, column_name, value_col, categories, color_mapping)
        return

    # One consistent colour per segment, shared by the bar and line charts
    segments = sorted(result['SEGM_CLUST'].unique())
    color_mapping = dict(zip(segments, sns.color_palette("husl", len(segments))))

    if kind == 'subs':
        if agg_method == 'flat_count':
            title = f'GA Subs: {column_name} Per Segment, Flat Counts'
            _plot_segment_stacked_bar(result, column_name, 'count', title, 'Subscriber Count', color_mapping)
            _plot_segment_lines(result, 'count', title, 'Subscriber Count', color_mapping)
        else:
            title = f'GA Subs: {column_name} Per Segment, Relative To All Subs'
            _plot_segment_lines(result, 'proportion', title, 'Proportion of Subscribers', color_mapping)
    elif agg_method == 'flat_mean':
        title = f'GA Subs: {column_name} Per Segment, Flat Mean Values'
        _plot_segment_lines(result, 'mean', title, 'Avg. in Subscribers', color_mapping)
    elif agg_method == 'flat_sum':
        title = f'GA Subs: {column_name} Per Segment, Flat Sums'
        # The figure for the bar chart is created only here, so the other
        # numeric methods no longer leave an empty figure behind.
        _plot_segment_stacked_bar(result, column_name, 'sum', title, 'Sum Value', color_mapping)
        _plot_segment_lines(result, 'sum', title, 'Sum in Subscribers', color_mapping)
    else:
        title = f'GA Subs: {column_name} Per Segment, Relative To Clusters'
        _plot_segment_lines(result, 'proportion', title, 'Proportion of Subscribers', color_mapping)

In [None]:
def run_prod_mix_analysis(df, count_type="flat_count"):
    """Chart the product mix (plan duration x data group) of every segment over time.

    One figure is drawn per SEGM_CLUST value. With
    ``count_type="rel_perc_clust"`` it is a line chart of each
    duration/data-group combination as a share of the segment's subscribers on
    the snapshot date; with any other value it is a stacked bar chart of
    subscriber counts.

    Args:
        df: DataFrame with SEGM_CLUST, SNAPSHOT_DATE, PROD_DURATION_MONTHS,
            PROD_DATA_GROUP and SUB_BILLING_ID columns. It is not modified.
        count_type: "flat_count" (default) or "rel_perc_clust".

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    relative = count_type == "rel_perc_clust"

    # Work on a narrow private copy so the integer cast does not alter the caller's frame
    data = df[['SEGM_CLUST', 'SNAPSHOT_DATE', 'PROD_DURATION_MONTHS', 'PROD_DATA_GROUP', 'SUB_BILLING_ID']].copy()
    data['SEGM_CLUST'] = data['SEGM_CLUST'].astype(int)

    # Colour per data group and duration: one hue per duration, darker as the data group grows
    color_dict = {
        "SMALL": {
            "1": "#e1bdad",
            "3": "#b5e5c2",
            "6": "#aed6e1",
            "12": "#dab7e1"
        },
        "MEDIUM": {
            "1": "#e19c7d",
            "3": "#7ee399",
            "6": "#87cee1",
            "12": "#d184e1"
        },
        "LARGE": {
            "1": "#e57545",
            "3": "#47e171",
            "6": "#44c0e1",
            "12": "#c745e1"
        },
        "UNLIMITED": {
            "1": "#e85b1e",
            "3": "#17e14e",
            "6": "#18b5df",
            "12": "#be1adf"
        }
    }

    # Display order for the pivot columns
    prod_data_group_order = ['SMALL', 'MEDIUM', 'LARGE', 'UNLIMITED']
    prod_duration_months_order = ["1", "3", "6", "12"]

    def custom_sort_key(col):
        """Order (duration, group) columns by duration first, then data group."""
        duration, group = col
        # str() keeps the ordering lookup consistent with the colour lookup below
        return (prod_duration_months_order.index(str(duration)), prod_data_group_order.index(group))

    for cluster in sorted(data['SEGM_CLUST'].unique()):
        cluster_df = data[data['SEGM_CLUST'] == cluster]

        # Subscriber count per snapshot date for every (duration, data group) combination
        pivot_df = cluster_df.pivot_table(index='SNAPSHOT_DATE', columns=['PROD_DURATION_MONTHS', 'PROD_DATA_GROUP'], values='SUB_BILLING_ID', aggfunc='count', fill_value=0)

        if relative:
            # Divide by the segment's non-null subscriber count on each snapshot date
            total_counts = cluster_df.groupby(['SNAPSHOT_DATE'])['SUB_BILLING_ID'].count()
            pivot_df = pivot_df.div(total_counts, axis=0)

        pivot_df = pivot_df[sorted(pivot_df.columns, key=custom_sort_key)]
        # Show snapshot dates as yyyy-mm-dd strings on the x-axis
        pivot_df.index = pivot_df.index.strftime('%Y-%m-%d')

        if relative:
            plt.figure(figsize=(12, 8))
            for col in pivot_df.columns:
                plt.plot(pivot_df.index, pivot_df[col], label=f'{col[0]} {col[1]}', color=color_dict[col[1]][str(col[0])], marker='o')
        else:
            pivot_df.plot(kind='bar', stacked=True, figsize=(12, 8), color=[color_dict[col[1]][str(col[0])] for col in pivot_df.columns])

        plt.title(f'GA Subs: Product Mix Chart for Cluster {cluster}: {cluster_descriptions[cluster]}')
        plt.xlabel('Snapshot Date')
        # Label follows the chart actually drawn, so a count chart is never titled as a proportion
        plt.ylabel('Proportion of Subscribers' if relative else 'Subscriber Count')
        plt.xticks(rotation=45)

        # Show full numbers on the y-axis instead of scientific notation
        plt.gca().yaxis.set_major_formatter(ScalarFormatter())
        plt.gca().ticklabel_format(style='plain', axis='y')

        # Legend outside the plot area, with duplicate labels collapsed
        handles, labels = plt.gca().get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        plt.legend(by_label.values(), by_label.keys(), title='Product Mix (Duration, Size)', bbox_to_anchor=(1.05, 1), loc='upper left')

        plt.tight_layout()
        plt.show()

# A. ACTUAL COUNTS IN SEGMENTS
# -------------------------------

# GA TOTAL SUBS COUNT

In [None]:
# Subscriber counts per segment and snapshot date: stacked bars, then a line chart
plot_analysis(df, column_name="SUB_BILLING_ID", agg_method="flat_count")

# GA PRODUCT MIX

In [None]:
# One stacked bar chart per segment: subscriber counts by plan duration and data group
run_prod_mix_analysis(df, count_type="flat_count")

# PROMO TYPE

In [None]:
# Counts of each PROMO_GROUPED value, charted segment by segment
# (assumes PROMO_GROUPED is a non-numeric column - TODO confirm)
plot_analysis(df, column_name="PROMO_GROUPED", agg_method="flat_count")

# GA EXPECTED CLV

In [None]:
# Mean EXPECTED_CLV_PS per segment and snapshot date (line chart)
plot_analysis(df, column_name="EXPECTED_CLV_PS", agg_method="flat_mean")

# PORT-IN %

In [None]:
# Sum of PORTIN_FLAG per segment and snapshot date: stacked bars, then a line chart
plot_analysis(df, column_name="PORTIN_FLAG", agg_method="flat_sum")

# PORT-IN SOURCES

In [None]:
# Counts of each PORTIN_CMPT_GROUP_NAME_GROUPED value, charted segment by segment
# (assumes the column is non-numeric - TODO confirm)
plot_analysis(df, column_name="PORTIN_CMPT_GROUP_NAME_GROUPED", agg_method="flat_count")

# GA SUBS THAT HAD ISSUES PORTING IN

In [None]:
# Sum of HAD_ISSUES_PORTING_IN per segment and snapshot date: stacked bars, then a line chart
plot_analysis(df, column_name="HAD_ISSUES_PORTING_IN", agg_method="flat_sum")

# B. % COUNTS, SEGMENT vs SEGMENT
# ----------------------------------------

# GA TOTAL SUBS

In [None]:
# Each segment's share of all subscribers on a snapshot date (line chart)
plot_analysis(df, column_name="SUB_BILLING_ID", agg_method="rel_perc_clust")

# GA PRODUCT MIX

In [None]:
# One line chart per segment: each duration/data-group combination as a share of the segment's subscribers
run_prod_mix_analysis(df, count_type="rel_perc_clust")

# GA PROMO TYPE

In [None]:
# Share of each PROMO_GROUPED value within its segment and snapshot date
# (assumes PROMO_GROUPED is a non-numeric column - TODO confirm)
plot_analysis(df, column_name="PROMO_GROUPED", agg_method="rel_perc_clust")

# PORT-IN %

In [None]:
# PORTIN_FLAG sum divided by its non-null count within each segment and snapshot date
plot_analysis(df, column_name="PORTIN_FLAG", agg_method="rel_perc_clust")

# PORT-IN SOURCES

In [None]:
# Share of each PORTIN_CMPT_GROUP_NAME_GROUPED value within its segment and snapshot date
# (assumes the column is non-numeric - TODO confirm)
plot_analysis(df, column_name="PORTIN_CMPT_GROUP_NAME_GROUPED", agg_method="rel_perc_clust")

# GA SUBS w/ ISSUES PORTING IN

In [None]:
# HAD_ISSUES_PORTING_IN sum divided by its non-null count within each segment and snapshot date
plot_analysis(df, column_name="HAD_ISSUES_PORTING_IN", agg_method="rel_perc_clust")