In [None]:
# Packages
import numpy as np
import pandas as pd
import snowflake.connector as snow
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import AgglomerativeClustering
#from collections import Counter

import umap

from sklearn.metrics import silhouette_score

import pickle
import time

In [None]:
# CONSTANTS & FUNCTIONS Section

# CONSTANTS
# Base feature columns taken from the preprocessed frame before one-hot encoding;
# performHierClustering indexes the frame with this list (`model_df[ORIGIN_COLS]`).
# Entries that are commented out are currently excluded from that selection.
# Order matters only in that it determines the column order handed to pd.get_dummies.
ORIGIN_COLS = ['ACTIVATING_SALE_GROUP_NAME_grouped',
'PROMO_GROUPED',
'GSMA_OPERATING_SYSTEM_grouped',
'ZERO_USAGE_LAST_30D_FLAG',
'SUB_AUTO_RENEWAL_FLAG',
'SUB_CREDIT_CARD_FLAG',
'PLAN_CYCLE_NUM_grouped',
'FAILED_PAYMENT_GROUPED',
'MEMBER_OF_ACTIVE_FAMILY_FLAG',
#'ACS_HP_PROP', 
#'ACS_NOT_HP_ASIAN_ALONE_PROP',
#'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
#'ACS_NOT_HP_WHITE_ALONE_PROP',
#'ACS_PROP_WORKERS_OVER_16',
#'ACS_AGE_MEDIAN',
#'ACS_INCOME_MEDIAN',
#'ACS_APPROX_COMMUTE_MEDIAN',
#'NO_SCHOOL_PROP',
#'ANY_DEGREE_PROP',
#'SINGLE_MOM_PROP',
#'NEVER_MARRIED_PROP',
#'HH_WO_INT_ACCESS_PROP',
#'OCCUP_HOUS_UNIT_WO_CAR_PROP',
'UPGRADE_DOWNGRADE_DATA_FLAG', 
'UPGRADE_DOWNGRADE_DURATION_FLAG',
'SUB_ESIM_FLAG',
'PORTIN_FLAG',
#'HAD_ISSUES_PORTING_IN',
'EVER_LOGGED_INTO_APP_FLAG',
'YEAR_RELEASED_grouped',
'LTE_BAND_71',
#'contacted_care_last7d', 
'contacted_care_last30d',
'SERVICE_ISSUE_NOTE_FLAG', 
#'SIM_REPLACEMENT_NOTE_FLAG',
#'PAYMENT_NOTE_FLAG',
'TENURE_MONTHS',
'EXPECTED_CLV_PS',
'PROMO_FLAG',
'PORTIN_ISSUE_DESC',
# NOTE(review): L1_CLUST is not created anywhere in the visible notebook —
# presumably it arrives with the subscriber query; confirm.
'L1_CLUST'
]

# Demographic Info Only
# Census-style (ACS) proportion / median columns.
# NOTE(review): this list is not referenced anywhere in the visible notebook —
# presumably it belongs to an earlier "L1" clustering pass; confirm before removing.
ML_COLS_L1 = ['ACS_HP_PROP',
       'ACS_NOT_HP_ASIAN_ALONE_PROP',
       'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
       'ACS_NOT_HP_WHITE_ALONE_PROP', 
       'ACS_PROP_WORKERS_OVER_16',
       'ACS_AGE_MEDIAN', 
       'ACS_INCOME_MEDIAN',
       'ACS_APPROX_COMMUTE_MEDIAN',
        'NO_SCHOOL_PROP',
        'ANY_DEGREE_PROP',
        'SINGLE_MOM_PROP',
        'NEVER_MARRIED_PROP',
        'HH_WO_INT_ACCESS_PROP',
        'OCCUP_HOUS_UNIT_WO_CAR_PROP']

# All Columns
# Post-encoding model columns. This list is passed as `modelvar_cols_list` to
# performHierClustering, where it selects the columns that are scaled and kept for
# clustering, so every name here must exist after pd.get_dummies has run
# (a missing dummy level raises KeyError at the scaling step).
# NOTE(review): one level per encoded category appears to be left out on purpose
# (e.g. L1_CLUST_1) — presumably as the reference level; confirm.
ML_COLS_L3 = ['ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm',
    'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail', 
    'PROMO_GROUPED_Deflation', 
    #'PROMO_GROUPED_Device bundle', 
    'PROMO_GROUPED_No Promo',
    'GSMA_OPERATING_SYSTEM_grouped_iOS',
    'PLAN_CYCLE_NUM_grouped', 
    'FAILED_PAYMENT_GROUPED',
    'MEMBER_OF_ACTIVE_FAMILY_FLAG', 
    #'ACS_HP_PROP',
    #'ACS_NOT_HP_ASIAN_ALONE_PROP',
    #'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
    #'ACS_NOT_HP_WHITE_ALONE_PROP', 
    #'ACS_PROP_WORKERS_OVER_16',
    #'ACS_AGE_MEDIAN', 
    #'ACS_INCOME_MEDIAN', 
    #'ACS_APPROX_COMMUTE_MEDIAN', 
    #'NO_SCHOOL_PROP',
    #'ANY_DEGREE_PROP',
    #'SINGLE_MOM_PROP',
    #'NEVER_MARRIED_PROP',
    #'HH_WO_INT_ACCESS_PROP',
    #'OCCUP_HOUS_UNIT_WO_CAR_PROP',
    'UPGRADE_DOWNGRADE_DATA_FLAG', 
    'UPGRADE_DOWNGRADE_DURATION_FLAG',
    'SUB_ESIM_FLAG', 
    #'HAD_ISSUES_PORTING_IN', 
    'EVER_LOGGED_INTO_APP_FLAG',
    'LTE_BAND_71',
    #'contacted_care_last7d', 
    'contacted_care_last30d',
    'SERVICE_ISSUE_NOTE_FLAG',
    #'SIM_REPLACEMENT_NOTE_FLAG', 
    #'PAYMENT_NOTE_FLAG',
    'EXPECTED_CLV_PS',
    'PORTIN_ISSUE_DESC_NoIssues',
    'PORTIN_ISSUE_DESC_NonPortin',
    'L1_CLUST_0',
    #'L1_CLUST_1',
    'L1_CLUST_2',
    'L1_CLUST_3',
    'L1_CLUST_4',
    'L1_CLUST_5',
    'L1_CLUST_6'
]

# Columns to Encode
# Categorical columns that performHierClustering one-hot encodes via pd.get_dummies
# (passed in as `encode_cols_list`).
ENCODE_COLS = ['ACTIVATING_SALE_GROUP_NAME_grouped', 'PROMO_GROUPED', 'GSMA_OPERATING_SYSTEM_grouped', 'PORTIN_ISSUE_DESC', 'L1_CLUST']

# Columns to Stratify On
# Group keys for the stratified sample; the same columns are dropped from the
# sampled frame afterwards, before it is merged with the subscriber data.
STRATIFY_COLS = ['BLK_GRP', 'PROMO_FLAG', 'PROMO_GROUPED', 'LTE_BAND_71', 'ACTIVATING_SALE_GROUP_NAME']

# FUNCTIONS
def categorize_sale_group(column):
    """Bucket a raw activating-sale-group value into a coarse channel label.

    The value is stringified and searched for the known channel names in
    priority order; the first name found as a substring is returned.

    Parameters:
    - column: a single cell value (any type; converted with str()).

    Returns:
    - 'Direct EComm', 'National Retail', 'Campus SIMs', or 'Other' when none match.
    """
    text = str(column)
    # Order matters: the first matching channel wins.
    for channel in ('Direct EComm', 'National Retail', 'Campus SIMs'):
        if channel in text:
            return channel
    return 'Other'

def categorize_device_year(column):
    """Bucket a device release year into one of 2019..2024.

    Parameters:
    - column: a single cell value holding the release year (number, numeric
      string, None, NaN, ...).

    Returns:
    - The year itself when it is between 2020 and 2024 (inclusive).
    - 2022 when the value is missing or cannot be converted to an integer
      (including NaN and infinite values), or when it converts to 0.
    - 2019 for every other year.
    """
    try:
        # Attempt to convert the value to an integer year
        year = int(float(column))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: +/-inf. All are treated as unknown.
        year = 0

    if year == 0:  # accounting for majority of devices being Galaxy A53 5G
        return 2022
    if 2020 <= year <= 2024:
        return year
    # NOTE(review): years after 2024 also land in the 2019 bucket, exactly as
    # before — confirm that is intended once newer devices show up.
    return 2019

def categorize_os(column):
    """Collapse a raw operating-system value into 'Android', 'iOS' or 'Other'.

    Parameters:
    - column: a single cell value (any type; converted with str()).

    Returns:
    - 'Android' if the text contains 'Android';
    - 'iOS' if it contains 'iOS' and does not contain 'KaiOS';
    - 'Other' in every remaining case.
    """
    os_name = str(column)
    if 'Android' in os_name:
        return 'Android'
    # 'KaiOS' contains the substring 'iOS', so it has to be ruled out explicitly.
    if 'iOS' in os_name and 'KaiOS' not in os_name:
        return 'iOS'
    return 'Other'

def categorize_plan_cycle_num(column):
    """Cap a plan-cycle number at 4.

    Parameters:
    - column: a single numeric cell value.

    Returns:
    - The value unchanged when it is <= 3, otherwise 4. A value for which the
      `<= 3` comparison is False (e.g. NaN) therefore also yields 4.
    """
    return column if column <= 3 else 4

# Fits a MinMaxScaler on the given columns, scales them, and pickles the fitted scaler
def min_max_scale_export(df, columns_to_scale, scalar_path):
    """
    Fit a 0-1 Min-Max scaler on the selected columns, apply it, and persist it.

    The scaled values are written back into `df` (the caller's frame is modified),
    and the fitted scaler is pickled so the identical scaling can be re-applied later.

    Parameters:
    - df: pandas DataFrame.
    - columns_to_scale: List of column names to apply Min-Max Scaling.
    - scalar_path: File path the fitted scaler is pickled to.

    Returns:
    - DataFrame holding only the specified columns, scaled to the range 0 to 1.
    """
    # Learn per-column min/max, then map each column onto [0, 1]
    fitted_scaler = MinMaxScaler(feature_range=(0, 1)).fit(df[columns_to_scale])
    df[columns_to_scale] = fitted_scaler.transform(df[columns_to_scale])

    # Persist the fitted scaler for later reuse
    with open(scalar_path, 'wb') as out_file:
        pickle.dump(fitted_scaler, out_file)

    return df[columns_to_scale]

# Re-applies a previously pickled scaler to the given columns
def min_max_scale_import(df, columns_to_scale, scalar_path):
    """
    Load a pickled scaler and use it to transform the selected columns.

    The transformed values are written back into `df` (the caller's frame is modified).

    Parameters:
    - df: pandas DataFrame.
    - columns_to_scale: List of column names the saved scaler is applied to.
    - scalar_path: Path of the pickled scaler object.

    Returns:
    - DataFrame holding only the specified columns, transformed by the saved scaler.
    """
    # NOTE: unpickling can execute arbitrary code — only load files this pipeline wrote.
    with open(scalar_path, 'rb') as in_file:
        saved_scaler = pickle.load(in_file)

    # Transform with the already-fitted scaler (no re-fitting)
    scaled_values = saved_scaler.transform(df[columns_to_scale])
    df[columns_to_scale] = scaled_values

    return df[columns_to_scale]

# Fits a UMAP reducer on the given columns, embeds them, and pickles the fitted reducer
def umap_export(df, columns_to_reduce, n_components, umap_path):
    """
    Fit UMAP on the selected columns, return the embedding, and save the reducer.

    Parameters:
    df (pd.DataFrame): The DataFrame to perform UMAP on.
    columns_to_reduce (list): The subset of columns to reduce the dimensions of.
    n_components (int): The number of components to reduce to.
    umap_path (str): The path the fitted UMAP reducer is pickled to.

    Returns:
    pd.DataFrame: Embedding with columns 'UMAP1' .. 'UMAP<n_components>'. The frame
    is built without an explicit index, so it does not carry over `df`'s index.
    """
    # Fit a fresh reducer and embed the selected columns in one step
    reducer = umap.UMAP(n_components=n_components)
    embedding = reducer.fit_transform(df[columns_to_reduce])

    # Persist the fitted reducer
    with open(umap_path, 'wb') as out_file:
        pickle.dump(reducer, out_file)

    # Name the embedding dimensions UMAP1..UMAPn
    component_names = [f'UMAP{idx}' for idx in range(1, n_components + 1)]
    return pd.DataFrame(embedding, columns=component_names)

# Applies a provided UMAP Reducer object to the df
def umap_import(df, columns_to_reduce, n_components, umap_path):
    """
    Load a previously saved UMAP reducer and use it to embed the selected columns.

    Unlike umap_export, nothing is fitted here and the file at `umap_path` is only
    read, never overwritten. (The earlier implementation re-fitted a new reducer and
    replaced the saved one, which defeated the purpose of importing it.)

    Parameters:
    df (pd.DataFrame): The DataFrame to transform.
    columns_to_reduce (list): The subset of columns the reducer was fitted on.
    n_components (int): The number of components the saved reducer produces; used to
        name the output columns.
    umap_path (str): The path of the pickled, already-fitted UMAP reducer.

    Returns:
    pd.DataFrame: Embedding with columns 'UMAP1' .. 'UMAP<n_components>', built
    without an explicit index (same convention as umap_export).

    Raises:
    ValueError: (from pandas) if the reducer's output width differs from n_components.
    """
    # Select the specified columns to reduce
    data_to_reduce = df[columns_to_reduce]

    # Load the fitted reducer.
    # NOTE: unpickling can execute arbitrary code — only load files this pipeline wrote.
    with open(umap_path, 'rb') as file:
        reducer = pickle.load(file)

    # Project the data with the existing model; do not re-fit
    embedding = reducer.transform(data_to_reduce)

    # Convert the result to a DataFrame for easier handling
    embedding_df = pd.DataFrame(embedding, columns=[f'UMAP{i+1}' for i in range(n_components)])

    return embedding_df

# Define Pre-processing Steps
def hierPreprocess(df):
    """
    Impute ACS measures and derive the grouped / flag columns used for clustering.

    The frame is modified in place (columns are added or overwritten) and returned.

    Steps:
    - ACS_AGE_MEDIAN: TOTAL_POP_MEDIAN_AGE, falling back to ACS_APPROX_AGE_MEDIAN,
      then to the median of the combined column.
    - ACS_INCOME_MEDIAN: ACS_APPROX_INCOME_MEDIAN with its own median as fallback.
    - Commute, ethnicity-proportion and worker-proportion columns: NA -> column median.
    - *_grouped columns via the categorize_* helpers.
    - contacted_care_last7d / contacted_care_last30d: 1 when the matching note count > 0.
    - PORTIN_ISSUE_DESC: 'HadIssues' / 'NoIssues' / 'NonPortin' / 'UnKnown' derived
      from PORTIN_FLAG and HAD_ISSUES_PORTING_IN.

    Parameters:
    - df: pandas DataFrame containing the raw columns referenced above.

    Returns:
    - The same DataFrame object, with the derived columns added.
    """
    ## Impute ACS Statistic Measures where NA
    # Age
    df['ACS_AGE_MEDIAN'] = df['TOTAL_POP_MEDIAN_AGE'].fillna(df['ACS_APPROX_AGE_MEDIAN'])
    df['ACS_AGE_MEDIAN'] = df['ACS_AGE_MEDIAN'].fillna(df['ACS_AGE_MEDIAN'].median())
    # Income
    df['ACS_INCOME_MEDIAN'] = df['ACS_APPROX_INCOME_MEDIAN'].fillna(df['ACS_APPROX_INCOME_MEDIAN'].median())
    # Commute, Ethnicities, Proportion of Workers: fill each column with its own median
    median_imputed_cols = (
        'ACS_APPROX_COMMUTE_MEDIAN',
        'ACS_HP_PROP',
        'ACS_NOT_HP_ASIAN_ALONE_PROP',
        'ACS_NOT_HP_AFRICAN_AMERICAN_ALONE_PROP',
        'ACS_NOT_HP_WHITE_ALONE_PROP',
        'ACS_PROP_WORKERS_OVER_16',
    )
    for col_name in median_imputed_cols:
        df[col_name] = df[col_name].fillna(df[col_name].median())

    # Use Categorization/Bucketing Functions
    df['ACTIVATING_SALE_GROUP_NAME_grouped'] = df['ACTIVATING_SALE_GROUP_NAME'].map(categorize_sale_group)
    df['YEAR_RELEASED_grouped'] = df['YEAR_RELEASED'].map(categorize_device_year)
    df['GSMA_OPERATING_SYSTEM_grouped'] = df['GSMA_OPERATING_SYSTEM'].map(categorize_os)
    df['PLAN_CYCLE_NUM_grouped'] = df['PLAN_CYCLE_NUM'].map(categorize_plan_cycle_num)
    df['contacted_care_last7d'] = np.where(df['CNT_NOTES_LAST_7D']>0,1,0)
    df['contacted_care_last30d'] = np.where(df['CNT_NOTES_LAST_30D']>0,1,0)

    # Determine PORTIN_ISSUE_DESC column-wise rather than with a per-row
    # df.apply(axis=1); this yields the same labels without a Python-level row loop.
    # Missing values compare as "no match" and therefore end up in 'UnKnown'.
    def _as_mask(condition):
        return condition.fillna(False).to_numpy(dtype=bool)

    ported_in = _as_mask(df['PORTIN_FLAG'] == 1)
    not_ported_in = _as_mask(df['PORTIN_FLAG'] == 0)
    had_issues = _as_mask(df['HAD_ISSUES_PORTING_IN'] == 1)
    had_no_issues = _as_mask(df['HAD_ISSUES_PORTING_IN'] == 0)

    # np.select takes the first condition that holds, mirroring the if/elif order
    df['PORTIN_ISSUE_DESC'] = np.select(
        [ported_in & had_issues, ported_in & had_no_issues, not_ported_in],
        ['HadIssues', 'NoIssues', 'NonPortin'],
        default='UnKnown',  # In case there are other combinations
    )

    return df

def coerceDataTypes_small(df):
    """
    Downcast the model columns to compact dtypes (int8 flags, float16 CLV).

    Prints the frame's memory footprint (GB, decimal) before and after, plus the
    resulting dtypes. The frame is modified in place and returned.

    Parameters:
    - df: pandas DataFrame containing every column listed in the dtype table below.

    Returns:
    - The same DataFrame object with the listed columns cast.
    """
    print("Memory Used Before Coercing: ")
    print(df.memory_usage(deep=True).sum()/1000/1000/1000)

    # Column -> target dtype, applied in this order. Demographic (ACS / *_PROP) columns
    # and a few flags (HAD_ISSUES_PORTING_IN, contacted_care_last7d, note flags,
    # PROMO_GROUPED_Device bundle, L1_CLUST_1) are intentionally not cast here.
    target_dtypes = {
        'PROMO_GROUPED_Deflation': 'int8',
        'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail': 'int8',
        'PLAN_CYCLE_NUM_grouped': 'int8',
        'FAILED_PAYMENT_GROUPED': 'int8',
        'EXPECTED_CLV_PS': 'float16',
        'MEMBER_OF_ACTIVE_FAMILY_FLAG': 'int8',
        'SUB_ESIM_FLAG': 'int8',
        'PORTIN_ISSUE_DESC_NoIssues': 'int8',
        'PORTIN_ISSUE_DESC_NonPortin': 'int8',
        'EVER_LOGGED_INTO_APP_FLAG': 'int8',
        'LTE_BAND_71': 'int8',
        'UPGRADE_DOWNGRADE_DATA_FLAG': 'int8',
        'UPGRADE_DOWNGRADE_DURATION_FLAG': 'int8',
        'contacted_care_last30d': 'int8',
        'SERVICE_ISSUE_NOTE_FLAG': 'int8',
        'ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm': 'int8',
        'PROMO_GROUPED_No Promo': 'int8',
        'GSMA_OPERATING_SYSTEM_grouped_iOS': 'int8',
        'L1_CLUST_0': 'int8',
        'L1_CLUST_2': 'int8',
        'L1_CLUST_3': 'int8',
        'L1_CLUST_4': 'int8',
        'L1_CLUST_5': 'int8',
        'L1_CLUST_6': 'int8',
    }
    for col_name, dtype_name in target_dtypes.items():
        df[col_name] = df[col_name].astype(dtype_name)

    print(df.dtypes)

    print("Memory Used After Coercing")
    print(df.memory_usage(deep=True).sum()/1000/1000/1000)
    print("")

    return df

def coerceDataTypes_med(df):
    """
    Downcast the model columns to compact dtypes (int8 flags, float32 CLV).

    Same column set as coerceDataTypes_small; the only difference is that
    EXPECTED_CLV_PS is kept at float32 instead of float16. Prints the frame's
    memory footprint (GB, decimal) before and after, plus the resulting dtypes.
    The frame is modified in place and returned.

    Parameters:
    - df: pandas DataFrame containing every column listed in the dtype table below.

    Returns:
    - The same DataFrame object with the listed columns cast.
    """
    print("Memory Used Before Coercing: ")
    print(df.memory_usage(deep=True).sum()/1000/1000/1000)

    # Column -> target dtype, applied in this order. Demographic (ACS / *_PROP) columns
    # and a few flags (HAD_ISSUES_PORTING_IN, contacted_care_last7d, note flags,
    # PROMO_GROUPED_Device bundle, L1_CLUST_1) are intentionally not cast here.
    target_dtypes = {
        'PROMO_GROUPED_Deflation': 'int8',
        'ACTIVATING_SALE_GROUP_NAME_grouped_National Retail': 'int8',
        'PLAN_CYCLE_NUM_grouped': 'int8',
        'FAILED_PAYMENT_GROUPED': 'int8',
        'EXPECTED_CLV_PS': 'float32',
        'MEMBER_OF_ACTIVE_FAMILY_FLAG': 'int8',
        'SUB_ESIM_FLAG': 'int8',
        'PORTIN_ISSUE_DESC_NoIssues': 'int8',
        'PORTIN_ISSUE_DESC_NonPortin': 'int8',
        'EVER_LOGGED_INTO_APP_FLAG': 'int8',
        'LTE_BAND_71': 'int8',
        'UPGRADE_DOWNGRADE_DATA_FLAG': 'int8',
        'UPGRADE_DOWNGRADE_DURATION_FLAG': 'int8',
        'contacted_care_last30d': 'int8',
        'SERVICE_ISSUE_NOTE_FLAG': 'int8',
        'ACTIVATING_SALE_GROUP_NAME_grouped_Direct EComm': 'int8',
        'PROMO_GROUPED_No Promo': 'int8',
        'GSMA_OPERATING_SYSTEM_grouped_iOS': 'int8',
        'L1_CLUST_0': 'int8',
        'L1_CLUST_2': 'int8',
        'L1_CLUST_3': 'int8',
        'L1_CLUST_4': 'int8',
        'L1_CLUST_5': 'int8',
        'L1_CLUST_6': 'int8',
    }
    for col_name, dtype_name in target_dtypes.items():
        df[col_name] = df[col_name].astype(dtype_name)

    print(df.dtypes)

    print("Memory Used After Coercing")
    print(df.memory_usage(deep=True).sum()/1000/1000/1000)
    print("")

    return df

# Fixed Sample Ratio
# Fraction of rows kept from each stratum by the stratified sampling step
# (passed as `frac` to the per-group sample).
DF_SAMPLEPROP = 0.10


In [None]:
# Connection Details
# Opens the Snowflake session used by the query cell below. Values are placeholders
# in this copy of the notebook. With authenticator="externalbrowser" the connector
# is expected to hand authentication to a browser-based (SSO) login — interactive,
# so this cell presumably cannot run unattended; confirm.
# NOTE(review): `server` does not look like a documented connect() keyword
# (`account` / `host` are) — confirm it is actually needed.
con = snow.connect(
    user="DESENSITIZED",
    server="DESENSITIZED",
    database="DESENSITIZED",
    warehouse="DESENSITIZED",
    authenticator="externalbrowser",
    account="DESENSITIZED"
)

In [None]:
# Query text (placeholders in this copy of the notebook).
# jun_stratify_sql: rows used for stratified sampling; jun_subs_sql: subscriber attributes.
jun_stratify_sql = '''
DESENSITIZED
'''

jun_subs_sql = '''
DESENSITIZED

'''

# Create a cursor object and make sure it is closed again, even if a query fails.
cur = con.cursor()
try:
    # Fetch each result set from the cursor and deliver it as a pandas DataFrame.
    cur.execute(jun_stratify_sql)
    jun_stratify_df = cur.fetch_pandas_all()

    cur.execute(jun_subs_sql)
    jun_subs_df = cur.fetch_pandas_all()
finally:
    # The DataFrames are fully materialized, so the cursor is no longer needed.
    cur.close()

In [None]:
# Memory footprint of the subscriber frame in GB (decimal: bytes / 1000^3)
print(jun_subs_df.memory_usage(deep=True).sum()/1000/1000/1000)

In [None]:
# Check Data: preview the first rows of the subscriber frame
jun_subs_df.head()

In [None]:
# Check Data: (rows, columns) before de-duplication
jun_subs_df.shape

In [None]:
# Keep one row per SUB_BILLING_ID (the first occurrence) so that the later merge
# on SUB_BILLING_ID is 1:1 instead of 1:many.
jun_subs_df = jun_subs_df.drop_duplicates(subset='SUB_BILLING_ID')

In [None]:
# Check Data: (rows, columns) after de-duplication on SUB_BILLING_ID
jun_subs_df.shape

In [None]:
# Display the frame the stratified sample is drawn from
jun_stratify_df

In [None]:
# Get Stratified Sample out of the Overlapped Data
# Seed for the sampler so that Restart & Run All reproduces the same sample
# (and therefore the same clusters and exported CSVs).
SAMPLE_RANDOM_STATE = 42

# GroupBy.sample draws DF_SAMPLEPROP of the rows from every STRATIFY_COLS group and,
# unlike groupby(...).apply(...), always keeps the grouping columns in the result —
# the next cells rely on those columns still being present when they drop them.
jun_stratify_sampled_df = (
    jun_stratify_df
    .groupby(STRATIFY_COLS, group_keys=False)
    .sample(frac=DF_SAMPLEPROP, random_state=SAMPLE_RANDOM_STATE)
    .reset_index(drop=True)
)

In [None]:
# Preview the stratified sample
jun_stratify_sampled_df.head()

In [None]:
# (rows, columns) of the stratified sample
jun_stratify_sampled_df.shape

In [None]:
# The stratification keys have served their purpose; remove them from the sample
jun_stratify_sampled_df = jun_stratify_sampled_df.drop(columns=STRATIFY_COLS)

In [None]:
# Remove the snapshot date and block-group columns from the sample as well
jun_stratify_sampled_df = jun_stratify_sampled_df.drop(columns=["SNAPSHOT_DATE", "ACS_BLOCK_GROUP"])

In [None]:
# Preview the sample after the column drops
jun_stratify_sampled_df.head()

In [None]:
# (rows, columns) of the sample after the column drops
jun_stratify_sampled_df.shape

In [None]:
# Attach the subscriber attributes to every sampled SUB_BILLING_ID.
# Left join: each sampled row is kept even when no subscriber row matches.
jun_data_df = pd.merge(
    jun_stratify_sampled_df,
    jun_subs_df,
    how="left",
    on="SUB_BILLING_ID",
)

In [None]:
# (rows, columns) of the merged modelling frame
jun_data_df.shape

In [None]:
# Export for Data Consistency
#jun_data_df.to_csv("jun_data_df.csv")

In [None]:
# Memory footprint of the merged frame in GB (decimal: bytes / 1000^3)
print(jun_data_df.memory_usage(deep=True).sum()/1000/1000/1000)

In [None]:
# Inspect NAs (before imputation)
# Set the option to display more rows so the per-column listing is not truncated
pd.set_option('display.max_rows', 100)
# Check the number of missing values in each column
na_counts = jun_data_df.isna().sum()

# Sort the counts in descending order (worst columns first)
sorted_na_counts = na_counts.sort_values(ascending=False)

# Display the result
print(sorted_na_counts)

In [None]:
# Handle NAs appropriately
# Column -> value used where the column is missing. Most usage / note / flag columns
# default to 0; LTE_BAND_71 and PLAN_CYCLE_NUM default to 1.
NA_FILL_VALUES = {
    'VOLTE_USAGE_LAST_7D': 0,
    'ZERO_USAGE_LAST_30D_FLAG': 0,
    'SERVICE_ISSUE_NOTE_FLAG': 0,
    'SIM_REPLACEMENT_NOTE_FLAG': 0,
    'PAYMENT_NOTE_FLAG': 0,
    'LTE_BAND_71': 1,
    'HAD_ISSUES_PORTING_IN': 0,
    'EXPECTED_CLV_PS': 0,
    'UPGRADE_DOWNGRADE_DATA_FLAG': 0,
    'CNT_NOTES_LAST_30D': 0,
    'CNT_NOTES_LAST_7D': 0,
    'PLAN_CYCLE_NUM': 1,
}
for na_col, na_fill in NA_FILL_VALUES.items():
    jun_data_df[na_col] = jun_data_df[na_col].fillna(na_fill)

In [None]:
# Re-inspect NAs (after imputation) to see which columns still have missing values
# Set the option to display more rows so the per-column listing is not truncated
pd.set_option('display.max_rows', 100)
# Check the number of missing values in each column
na_counts = jun_data_df.isna().sum()

# Sort the counts in descending order (worst columns first)
sorted_na_counts = na_counts.sort_values(ascending=False)

# Display the result
print(sorted_na_counts)

In [None]:
# (rows, columns) of the modelling frame after NA handling
jun_data_df.shape

In [None]:
# Create Modelling Process
def performHierClustering(df_to_cluster_on, modelvar_cols_list, encode_cols_list, n_clusters, linkage_p, metric_p, clust_col_name, pp_exportdf_yn_name, mdl_exportdf_yn_name, scalar_obj_name, umap_obj_name):
    """
    Preprocess, encode, scale and agglomeratively cluster a subscriber frame.

    Pipeline: hierPreprocess -> drop exact duplicate rows -> select ORIGIN_COLS and
    one-hot encode -> min-max scale (scaler pickled) -> dtype coercion ->
    AgglomerativeClustering -> silhouette score -> labels appended -> optional CSV export.

    The input frame is NOT modified: preprocessing runs on a copy, so the first
    element of the returned tuple really is the caller's original data. (Previously
    the input was preprocessed and de-duplicated in place, which made the returned
    "original" and the preprocessed frame one and the same object.)

    Parameters:
    - df_to_cluster_on: raw pandas DataFrame (must hold the columns hierPreprocess reads).
    - modelvar_cols_list: post-encoding column names to scale and cluster on.
    - encode_cols_list: columns to one-hot encode; an empty list / non-list skips encoding.
    - n_clusters: number of clusters to form.
    - linkage_p: linkage criterion passed to AgglomerativeClustering.
    - metric_p: distance metric passed to AgglomerativeClustering.
    - clust_col_name: name of the column that receives the cluster labels.
    - pp_exportdf_yn_name: [flag, path]; the preprocessed frame is written to `path`
      as CSV only when `flag is True`.
    - mdl_exportdf_yn_name: [flag, path]; same for the model frame.
    - scalar_obj_name: path the fitted MinMaxScaler is pickled to.
    - umap_obj_name: path for the UMAP reducer; currently unused because the UMAP
      step is disabled.

    Returns:
    - Tuple (df_to_cluster_on, pp_df, model_df, umap_df, clusters_list_form, silhouetteScore).
      pp_df, model_df and umap_df each carry the label column and a constant
      "silhouette_score" column; umap_df holds only those two columns.
    """
    # Preprocess
    print("Beginning PreProcessing: ")
    print("")
    # hierPreprocess adds its columns in place, so give it a copy to keep the
    # caller's frame in its original form.
    pp_df = hierPreprocess(df_to_cluster_on.copy())
    print("Finished Preprocessing: ")
    print("")

    # Perform Drop Duplicates (on the private copy only)
    pp_df.drop_duplicates(inplace = True)

    # Encode if necessary
    print("Beginning Encoding: ")
    print("")
    if ((isinstance(encode_cols_list, list)) and (len(encode_cols_list) > 0)):
        # get_dummies returns a new frame, so pp_df stays untouched
        model_df = pd.get_dummies(pp_df[ORIGIN_COLS], columns = encode_cols_list)
    else:
        # Explicit copy: the scaler writes into model_df and must not touch pp_df
        model_df = pp_df[ORIGIN_COLS].copy()
    print("Finished Encoding: ")
    print("")

    print("Beginning Scaling: ")
    print("")
    # Scale on 0-1; only the columns in modelvar_cols_list are kept from here on
    model_df = min_max_scale_export(model_df, modelvar_cols_list, scalar_obj_name)
    print("Finished Scaling: ")
    print("")

    # Coerce Data Types
    print("Beginning Data Type Coercion: ")
    print("")
    model_df = coerceDataTypes_small(model_df)
    print("Finished Data Type Coercion: ")
    print("")

    # UMAP Transformation (currently disabled; an empty frame stands in for its output)
    #print("Beginning UMAP Transformation: ")
    #print("")
    #umap_df = umap_export(model_df, modelvar_cols_list, 20, umap_obj_name)
    umap_df = pd.DataFrame()
    #print("Finished UMAP Transformation: ")
    #print("")

    # Start the stopwatch
    start_time = time.time()
    # Model
    print("Beginning Modeling: ")
    print("")
    hier_model = AgglomerativeClustering(n_clusters = n_clusters, linkage = linkage_p, metric = metric_p).fit(model_df)
    print("Finished Modeling: ")
    print("")
    # Stop the stopwatch
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")

    print("Calculating Silhouette Score: ")
    print("")
    # Retain silhouette_score (computed before the label columns are appended)
    silhouetteScore = silhouette_score(model_df, hier_model.labels_)
    print("Done Calculating Silhouette Score: ")
    print("")

    print("Getting Clusters In Column Form: ")
    print("")
    # Get Labels in List Form
    clusters_list_form = (hier_model.labels_).tolist()
    print("Done Getting Clusters In Column Form: ")
    print("")

    print("Appending Column Values: ")
    print("")
    # Append labels positionally; pp_df and model_df have the same rows in the same order
    for frame in [pp_df, model_df, umap_df]:
        frame[clust_col_name] = clusters_list_form
        frame["silhouette_score"] = silhouetteScore
    print("Done Appending Column Values: ")
    print("")

    print("Exporting CSVs if requested: ")
    print("")
    # Export if needed
    if pp_exportdf_yn_name[0] is True:
        pp_df.to_csv(pp_exportdf_yn_name[1])
    if mdl_exportdf_yn_name[0] is True:
        model_df.to_csv(mdl_exportdf_yn_name[1])
    print("Done Exporting CSVs if requested: ")
    print("")

    return df_to_cluster_on, pp_df, model_df, umap_df, clusters_list_form, silhouetteScore

In [None]:
# Model
# Run the full pipeline on the sampled data: 14 clusters, complete linkage, euclidean
# distance, clustering on ML_COLS_L3 after one-hot encoding ENCODE_COLS.
# Both the preprocessed and the model frame are exported to CSV, and the fitted scaler
# is pickled; the UMAP path is passed but unused while the UMAP step is disabled.
# Result is the 6-tuple returned by performHierClustering.
jun_output = performHierClustering(jun_data_df, ML_COLS_L3, ENCODE_COLS, 14, linkage_p = "complete", metric_p = "euclidean", clust_col_name = "clust", 
                                   pp_exportdf_yn_name = [True, "jun_hclust14_10p_pp_output.csv"], mdl_exportdf_yn_name = [True, "jun_hclust14_10p_mdl_output.csv"], 
                                   scalar_obj_name = "jun_hclust14_scalar.pkl", umap_obj_name = "jun_hclust14_umap.pkl")

In [None]:
# Validation, Debug, Troubleshoot
#model_output[1] # Labels

In [None]:
#model_output[0].shape

In [None]:
#len(model_output[1])

In [None]:
# Get Count Distribution
#counts = Counter(model_output[1])
#print(counts)

In [None]:
#missing_count_list = sum(1 for item in model_output[1] if item is None or (isinstance(item, float) and np.isnan(item)))
#print(f"Number of missing values: {missing_count_list}")